diff options
author | Jon Bratseth <bratseth@yahoo-inc.com> | 2016-06-15 23:09:44 +0200 |
---|---|---|
committer | Jon Bratseth <bratseth@yahoo-inc.com> | 2016-06-15 23:09:44 +0200 |
commit | 72231250ed81e10d66bfe70701e64fa5fe50f712 (patch) | |
tree | 2728bba1131a6f6e5bdf95afec7d7ff9358dac50 /juniper/src |
Publish
Diffstat (limited to 'juniper/src')
85 files changed, 12242 insertions, 0 deletions
diff --git a/juniper/src/Doxyfile b/juniper/src/Doxyfile new file mode 100644 index 00000000000..99f89d421d4 --- /dev/null +++ b/juniper/src/Doxyfile @@ -0,0 +1,931 @@ +# Copyright 2016 Yahoo Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. +# Doxyfile 1.2.14 + +# This file describes the settings to be used by the documentation system +# doxygen (www.doxygen.org) for a project +# +# All text after a hash (#) is considered a comment and will be ignored +# The format is: +# TAG = value [value, ...] +# For lists items can also be appended using: +# TAG += value [value, ...] +# Values that contain spaces should be placed between quotes (" ") + +#--------------------------------------------------------------------------- +# General configuration options +#--------------------------------------------------------------------------- + +# The PROJECT_NAME tag is a single word (or a sequence of words surrounded +# by quotes) that should identify the project. + +PROJECT_NAME = Juniper + +# The PROJECT_NUMBER tag can be used to enter a project or revision number. +# This could be handy for archiving the generated documentation or +# if some version control system is used. + +PROJECT_NUMBER = + +# The OUTPUT_DIRECTORY tag is used to specify the (relative or absolute) +# base path where the generated documentation will be put. +# If a relative path is entered, it will be relative to the location +# where doxygen was started. If left blank the current directory will be used. + +OUTPUT_DIRECTORY = ../doc/doxygen + +# The OUTPUT_LANGUAGE tag is used to specify the language in which all +# documentation generated by doxygen is written. Doxygen will use this +# information to generate all constant output in the proper language. 
+# The default language is English, other supported languages are: +# Brazilian, Chinese, Croatian, Czech, Danish, Dutch, Finnish, French, +# German, Greek, Hungarian, Italian, Japanese, Korean, Norwegian, Polish, +# Portuguese, Romanian, Russian, Slovak, Slovene, Spanish and Swedish. + +OUTPUT_LANGUAGE = English + +# If the EXTRACT_ALL tag is set to YES doxygen will assume all entities in +# documentation are documented, even if no documentation was available. +# Private class members and static file members will be hidden unless +# the EXTRACT_PRIVATE and EXTRACT_STATIC tags are set to YES + +EXTRACT_ALL = NO + +# If the EXTRACT_PRIVATE tag is set to YES all private members of a class +# will be included in the documentation. + +EXTRACT_PRIVATE = NO + +# If the EXTRACT_STATIC tag is set to YES all static members of a file +# will be included in the documentation. + +EXTRACT_STATIC = YES + +# If the EXTRACT_LOCAL_CLASSES tag is set to YES classes (and structs) +# defined locally in source files will be included in the documentation. +# If set to NO only classes defined in header files are included. + +EXTRACT_LOCAL_CLASSES = NO + +# If the HIDE_UNDOC_MEMBERS tag is set to YES, Doxygen will hide all +# undocumented members of documented classes, files or namespaces. +# If set to NO (the default) these members will be included in the +# various overviews, but no documentation section is generated. +# This option has no effect if EXTRACT_ALL is enabled. + +HIDE_UNDOC_MEMBERS = YES + +# If the HIDE_UNDOC_CLASSES tag is set to YES, Doxygen will hide all +# undocumented classes that are normally visible in the class hierarchy. +# If set to NO (the default) these classes will be included in the various +# overviews. This option has no effect if EXTRACT_ALL is enabled. 
+ +HIDE_UNDOC_CLASSES = YES + +# If the BRIEF_MEMBER_DESC tag is set to YES (the default) Doxygen will +# include brief member descriptions after the members that are listed in +# the file and class documentation (similar to JavaDoc). +# Set to NO to disable this. + +BRIEF_MEMBER_DESC = YES + +# If the REPEAT_BRIEF tag is set to YES (the default) Doxygen will prepend +# the brief description of a member or function before the detailed description. +# Note: if both HIDE_UNDOC_MEMBERS and BRIEF_MEMBER_DESC are set to NO, the +# brief descriptions will be completely suppressed. + +REPEAT_BRIEF = YES + +# If the ALWAYS_DETAILED_SEC and REPEAT_BRIEF tags are both set to YES then +# Doxygen will generate a detailed section even if there is only a brief +# description. + +ALWAYS_DETAILED_SEC = NO + +# If the INLINE_INHERITED_MEMB tag is set to YES, doxygen will show all inherited +# members of a class in the documentation of that class as if those members were +# ordinary class members. Constructors, destructors and assignment operators of +# the base classes will not be shown. + +INLINE_INHERITED_MEMB = NO + +# If the FULL_PATH_NAMES tag is set to YES then Doxygen will prepend the full +# path before files name in the file list and in the header files. If set +# to NO the shortest path that makes the file name unique will be used. + +FULL_PATH_NAMES = NO + +# If the FULL_PATH_NAMES tag is set to YES then the STRIP_FROM_PATH tag +# can be used to strip a user defined part of the path. Stripping is +# only done if one of the specified strings matches the left-hand part of +# the path. It is allowed to use relative paths in the argument list. + +STRIP_FROM_PATH = + +# The INTERNAL_DOCS tag determines if documentation +# that is typed after a \internal command is included. If the tag is set +# to NO (the default) then the documentation will be excluded. +# Set it to YES to include the internal documentation. 
+ +INTERNAL_DOCS = NO + +# Setting the STRIP_CODE_COMMENTS tag to YES (the default) will instruct +# doxygen to hide any special comment blocks from generated source code +# fragments. Normal C and C++ comments will always remain visible. + +STRIP_CODE_COMMENTS = YES + +# If the CASE_SENSE_NAMES tag is set to NO then Doxygen will only generate +# file names in lower case letters. If set to YES upper case letters are also +# allowed. This is useful if you have classes or files whose names only differ +# in case and if your file system supports case sensitive file names. Windows +# users are advised to set this option to NO. + +CASE_SENSE_NAMES = YES + +# If the SHORT_NAMES tag is set to YES, doxygen will generate much shorter +# (but less readable) file names. This can be useful if your file system +# doesn't support long names like on DOS, Mac, or CD-ROM. + +SHORT_NAMES = NO + +# If the HIDE_SCOPE_NAMES tag is set to NO (the default) then Doxygen +# will show members with their full class and namespace scopes in the +# documentation. If set to YES the scope will be hidden. + +HIDE_SCOPE_NAMES = NO + +# If the VERBATIM_HEADERS tag is set to YES (the default) then Doxygen +# will generate a verbatim copy of the header file for each class for +# which an include is specified. Set to NO to disable this. + +VERBATIM_HEADERS = YES + +# If the SHOW_INCLUDE_FILES tag is set to YES (the default) then Doxygen +# will put a list of the files that are included by a file in the documentation +# of that file. + +SHOW_INCLUDE_FILES = NO + +# If the JAVADOC_AUTOBRIEF tag is set to YES then Doxygen +# will interpret the first line (until the first dot) of a JavaDoc-style +# comment as the brief description. If set to NO, the JavaDoc +# comments will behave just like the Qt-style comments (thus requiring an +# explicit @brief command for a brief description). 
+ +JAVADOC_AUTOBRIEF = YES + +# If the INHERIT_DOCS tag is set to YES (the default) then an undocumented +# member inherits the documentation from any documented member that it +# reimplements. + +INHERIT_DOCS = YES + +# If the INLINE_INFO tag is set to YES (the default) then a tag [inline] +# is inserted in the documentation for inline members. + +INLINE_INFO = YES + +# If the SORT_MEMBER_DOCS tag is set to YES (the default) then doxygen +# will sort the (detailed) documentation of file and class members +# alphabetically by member name. If set to NO the members will appear in +# declaration order. + +SORT_MEMBER_DOCS = YES + +# If member grouping is used in the documentation and the DISTRIBUTE_GROUP_DOC +# tag is set to YES, then doxygen will reuse the documentation of the first +# member in the group (if any) for the other members of the group. By default +# all members of a group must be documented explicitly. + +DISTRIBUTE_GROUP_DOC = NO + +# The TAB_SIZE tag can be used to set the number of spaces in a tab. +# Doxygen uses this value to replace tabs by spaces in code fragments. + +TAB_SIZE = 8 + +# The GENERATE_TODOLIST tag can be used to enable (YES) or +# disable (NO) the todo list. This list is created by putting \todo +# commands in the documentation. + +GENERATE_TODOLIST = YES + +# The GENERATE_TESTLIST tag can be used to enable (YES) or +# disable (NO) the test list. This list is created by putting \test +# commands in the documentation. + +GENERATE_TESTLIST = YES + +# The GENERATE_BUGLIST tag can be used to enable (YES) or +# disable (NO) the bug list. This list is created by putting \bug +# commands in the documentation. + +GENERATE_BUGLIST = YES + +# This tag can be used to specify a number of aliases that acts +# as commands in the documentation. An alias has the form "name=value". 
+# For example adding "sideeffect=\par Side Effects:\n" will allow you to +# put the command \sideeffect (or @sideeffect) in the documentation, which +# will result in a user defined paragraph with heading "Side Effects:". +# You can put \n's in the value part of an alias to insert newlines. + +ALIASES = + +# The ENABLED_SECTIONS tag can be used to enable conditional +# documentation sections, marked by \if sectionname ... \endif. + +#ENABLED_SECTIONS = simple \ +# juniper \ +# textproc +# utils + +ENABLED_SECTIONS = juniper + +# The MAX_INITIALIZER_LINES tag determines the maximum number of lines +# the initial value of a variable or define consist of for it to appear in +# the documentation. If the initializer consists of more lines than specified +# here it will be hidden. Use a value of 0 to hide initializers completely. +# The appearance of the initializer of individual variables and defines in the +# documentation can be controlled using \showinitializer or \hideinitializer +# command in the documentation regardless of this setting. + +MAX_INITIALIZER_LINES = 30 + +# Set the OPTIMIZE_OUTPUT_FOR_C tag to YES if your project consists of C sources +# only. Doxygen will then generate output that is more tailored for C. +# For instance some of the names that are used will be different. The list +# of all members will be omitted, etc. + +OPTIMIZE_OUTPUT_FOR_C = NO + +# Set the SHOW_USED_FILES tag to NO to disable the list of files generated +# at the bottom of the documentation of classes and structs. If set to YES the +# list will mention the files that were used to generate the documentation. + +SHOW_USED_FILES = NO + +#--------------------------------------------------------------------------- +# configuration options related to warning and progress messages +#--------------------------------------------------------------------------- + +# The QUIET tag can be used to turn on/off the messages that are generated +# by doxygen. Possible values are YES and NO. 
If left blank NO is used. + +QUIET = NO + +# The WARNINGS tag can be used to turn on/off the warning messages that are +# generated by doxygen. Possible values are YES and NO. If left blank +# NO is used. + +WARNINGS = YES + +# If WARN_IF_UNDOCUMENTED is set to YES, then doxygen will generate warnings +# for undocumented members. If EXTRACT_ALL is set to YES then this flag will +# automatically be disabled. + +WARN_IF_UNDOCUMENTED = YES + +# The WARN_FORMAT tag determines the format of the warning messages that +# doxygen can produce. The string should contain the $file, $line, and $text +# tags, which will be replaced by the file and line number from which the +# warning originated and the warning text. + +WARN_FORMAT = + +# The WARN_LOGFILE tag can be used to specify a file to which warning +# and error messages should be written. If left blank the output is written +# to stderr. + +WARN_LOGFILE = + +#--------------------------------------------------------------------------- +# configuration options related to the input files +#--------------------------------------------------------------------------- + +# The INPUT tag can be used to specify the files and/or directories that contain +# documented source files. You may enter file names like "myfile.cpp" or +# directories like "/usr/src/myproject". Separate the files or directories +# with spaces. + +# Result processing interface: +# +INPUT = rpinterface.h query.h IJuniperProperties.h dpinterface.h + +# If the value of the INPUT tag contains directories, you can use the +# FILE_PATTERNS tag to specify one or more wildcard pattern (like *.cpp +# and *.h) to filter out the source-files in the directories. 
If left +# blank the following patterns are tested: +# *.c *.cc *.cxx *.cpp *.c++ *.java *.ii *.ixx *.ipp *.i++ *.inl *.h *.hh *.hxx *.hpp +# *.h++ *.idl *.odl + +FILE_PATTERNS = *.h \ + *.hpp \ + *.txt + +# The RECURSIVE tag can be used to specify whether or not subdirectories +# should be searched for input files as well. Possible values are YES and NO. +# If left blank NO is used. + +RECURSIVE = NO + +# The EXCLUDE tag can be used to specify files and/or directories that should be +# excluded from the INPUT source files. This way you can easily exclude a +# subdirectory from a directory tree whose root is specified with the INPUT tag. + +EXCLUDE = + +# The EXCLUDE_SYMLINKS tag can be used to select whether or not files or directories +# that are symbolic links (a Unix filesystem feature) are excluded from the input. + +EXCLUDE_SYMLINKS = NO + +# If the value of the INPUT tag contains directories, you can use the +# EXCLUDE_PATTERNS tag to specify one or more wildcard patterns to exclude +# certain files from those directories. + +EXCLUDE_PATTERNS = + +# The EXAMPLE_PATH tag can be used to specify one or more files or +# directories that contain example code fragments that are included (see +# the \include command). + +EXAMPLE_PATH = + +# If the value of the EXAMPLE_PATH tag contains directories, you can use the +# EXAMPLE_PATTERNS tag to specify one or more wildcard patterns (like *.cpp +# and *.h) to filter out the source-files in the directories. If left +# blank all files are included. + +EXAMPLE_PATTERNS = + +# If the EXAMPLE_RECURSIVE tag is set to YES then subdirectories will be +# searched for input files to be used with the \include or \dontinclude +# commands irrespective of the value of the RECURSIVE tag. +# Possible values are YES and NO. If left blank NO is used. 
+ +EXAMPLE_RECURSIVE = NO + +# The IMAGE_PATH tag can be used to specify one or more files or +# directories that contain images that are included in the documentation (see +# the \image command). + +IMAGE_PATH = + +# The INPUT_FILTER tag can be used to specify a program that doxygen should +# invoke to filter for each input file. Doxygen will invoke the filter program +# by executing (via popen()) the command <filter> <input-file>, where <filter> +# is the value of the INPUT_FILTER tag, and <input-file> is the name of an +# input file. Doxygen will then use the output that the filter program writes +# to standard output. + +INPUT_FILTER = + +# If the FILTER_SOURCE_FILES tag is set to YES, the input filter (if set using +# INPUT_FILTER) will be used to filter the input files when producing source +# files to browse. + +FILTER_SOURCE_FILES = NO + +#--------------------------------------------------------------------------- +# configuration options related to source browsing +#--------------------------------------------------------------------------- + +# If the SOURCE_BROWSER tag is set to YES then a list of source files will +# be generated. Documented entities will be cross-referenced with these sources. + +SOURCE_BROWSER = NO + +# Setting the INLINE_SOURCES tag to YES will include the body +# of functions and classes directly in the documentation. + +INLINE_SOURCES = NO + +# If the REFERENCED_BY_RELATION tag is set to YES (the default) +# then for each documented function all documented +# functions referencing it will be listed. + +REFERENCED_BY_RELATION = YES + +# If the REFERENCES_RELATION tag is set to YES (the default) +# then for each documented function all documented entities +# called/used by that function will be listed. 
+ +REFERENCES_RELATION = YES + +#--------------------------------------------------------------------------- +# configuration options related to the alphabetical class index +#--------------------------------------------------------------------------- + +# If the ALPHABETICAL_INDEX tag is set to YES, an alphabetical index +# of all compounds will be generated. Enable this if the project +# contains a lot of classes, structs, unions or interfaces. + +ALPHABETICAL_INDEX = NO + +# If the alphabetical index is enabled (see ALPHABETICAL_INDEX) then +# the COLS_IN_ALPHA_INDEX tag can be used to specify the number of columns +# in which this list will be split (can be a number in the range [1..20]) + +COLS_IN_ALPHA_INDEX = 5 + +# In case all classes in a project start with a common prefix, all +# classes will be put under the same header in the alphabetical index. +# The IGNORE_PREFIX tag can be used to specify one or more prefixes that +# should be ignored while generating the index headers. + +IGNORE_PREFIX = + +#--------------------------------------------------------------------------- +# configuration options related to the HTML output +#--------------------------------------------------------------------------- + +# If the GENERATE_HTML tag is set to YES (the default) Doxygen will +# generate HTML output. + +GENERATE_HTML = YES + +# The HTML_OUTPUT tag is used to specify where the HTML docs will be put. +# If a relative path is entered the value of OUTPUT_DIRECTORY will be +# put in front of it. If left blank `html' will be used as the default path. + +HTML_OUTPUT = + +# The HTML_FILE_EXTENSION tag can be used to specify the file extension for +# each generated HTML page (for example: .htm,.php,.asp). If it is left blank +# doxygen will generate files with .html extension. + +HTML_FILE_EXTENSION = .html + +# The HTML_HEADER tag can be used to specify a personal HTML header for +# each generated HTML page. 
If it is left blank doxygen will generate a +# standard header. + +HTML_HEADER = + +# The HTML_FOOTER tag can be used to specify a personal HTML footer for +# each generated HTML page. If it is left blank doxygen will generate a +# standard footer. + +HTML_FOOTER = + +# The HTML_STYLESHEET tag can be used to specify a user defined cascading +# style sheet that is used by each HTML page. It can be used to +# fine-tune the look of the HTML output. If the tag is left blank doxygen +# will generate a default style sheet + +HTML_STYLESHEET = + +# If the HTML_ALIGN_MEMBERS tag is set to YES, the members of classes, +# files or namespaces will be aligned in HTML using tables. If set to +# NO a bullet list will be used. + +HTML_ALIGN_MEMBERS = YES + +# If the GENERATE_HTMLHELP tag is set to YES, additional index files +# will be generated that can be used as input for tools like the +# Microsoft HTML help workshop to generate a compressed HTML help file (.chm) +# of the generated HTML documentation. + +GENERATE_HTMLHELP = YES + +# If the GENERATE_HTMLHELP tag is set to YES, the GENERATE_CHI flag +# controls if a separate .chi index file is generated (YES) or that +# it should be included in the master .chm file (NO). + +GENERATE_CHI = YES + +# If the GENERATE_HTMLHELP tag is set to YES, the BINARY_TOC flag +# controls whether a binary table of contents is generated (YES) or a +# normal table of contents (NO) in the .chm file. + +BINARY_TOC = NO + +# The TOC_EXPAND flag can be set to YES to add extra items for group members +# to the contents of the Html help documentation and to the tree view. + +TOC_EXPAND = NO + +# The DISABLE_INDEX tag can be used to turn on/off the condensed index at +# top of each HTML page. The value NO (the default) enables the index and +# the value YES disables it. + +DISABLE_INDEX = NO + +# This tag can be used to set the number of enum values (range [1..20]) +# that doxygen will group on one line in the generated HTML documentation. 
+ +ENUM_VALUES_PER_LINE = 4 + +# If the GENERATE_TREEVIEW tag is set to YES, a side panel will be +# generated containing a tree-like index structure (just like the one that +# is generated for HTML Help). For this to work a browser that supports +# JavaScript and frames is required (for instance Mozilla, Netscape 4.0+, +# or Internet explorer 4.0+). Note that for large projects the tree generation +# can take a very long time. In such cases it is better to disable this feature. +# Windows users are probably better off using the HTML help feature. + +GENERATE_TREEVIEW = NO + +# If the treeview is enabled (see GENERATE_TREEVIEW) then this tag can be +# used to set the initial width (in pixels) of the frame in which the tree +# is shown. + +TREEVIEW_WIDTH = 250 + +#--------------------------------------------------------------------------- +# configuration options related to the LaTeX output +#--------------------------------------------------------------------------- + +# If the GENERATE_LATEX tag is set to YES (the default) Doxygen will +# generate Latex output. + +GENERATE_LATEX = YES + +# The LATEX_OUTPUT tag is used to specify where the LaTeX docs will be put. +# If a relative path is entered the value of OUTPUT_DIRECTORY will be +# put in front of it. If left blank `latex' will be used as the default path. + +LATEX_OUTPUT = + +# If the COMPACT_LATEX tag is set to YES Doxygen generates more compact +# LaTeX documents. This may be useful for small projects and may help to +# save some trees in general. + +COMPACT_LATEX = NO + +# The PAPER_TYPE tag can be used to set the paper type that is used +# by the printer. Possible values are: a4, a4wide, letter, legal and +# executive. If left blank a4wide will be used. + +PAPER_TYPE = a4wide + +# The EXTRA_PACKAGES tag can be used to specify one or more names of LaTeX +# packages that should be included in the LaTeX output. 
+ +EXTRA_PACKAGES = + +# The LATEX_HEADER tag can be used to specify a personal LaTeX header for +# the generated latex document. The header should contain everything until +# the first chapter. If it is left blank doxygen will generate a +# standard header. Notice: only use this tag if you know what you are doing! + +LATEX_HEADER = + +# If the PDF_HYPERLINKS tag is set to YES, the LaTeX that is generated +# is prepared for conversion to pdf (using ps2pdf). The pdf file will +# contain links (just like the HTML output) instead of page references +# This makes the output suitable for online browsing using a pdf viewer. + +PDF_HYPERLINKS = YES + +# If the USE_PDFLATEX tag is set to YES, pdflatex will be used instead of +# plain latex in the generated Makefile. Set this option to YES to get a +# higher quality PDF documentation. + +USE_PDFLATEX = YES + +# If the LATEX_BATCHMODE tag is set to YES, doxygen will add the \\batchmode. +# command to the generated LaTeX files. This will instruct LaTeX to keep +# running if errors occur, instead of asking the user for help. +# This option is also used when generating formulas in HTML. + +LATEX_BATCHMODE = NO + +#--------------------------------------------------------------------------- +# configuration options related to the RTF output +#--------------------------------------------------------------------------- + +# If the GENERATE_RTF tag is set to YES Doxygen will generate RTF output +# The RTF output is optimised for Word 97 and may not look very pretty with +# other RTF readers or editors. + +GENERATE_RTF = NO + +# The RTF_OUTPUT tag is used to specify where the RTF docs will be put. +# If a relative path is entered the value of OUTPUT_DIRECTORY will be +# put in front of it. If left blank `rtf' will be used as the default path. + +RTF_OUTPUT = + +# If the COMPACT_RTF tag is set to YES Doxygen generates more compact +# RTF documents. This may be useful for small projects and may help to +# save some trees in general. 
+ +COMPACT_RTF = NO + +# If the RTF_HYPERLINKS tag is set to YES, the RTF that is generated +# will contain hyperlink fields. The RTF file will +# contain links (just like the HTML output) instead of page references. +# This makes the output suitable for online browsing using WORD or other +# programs which support those fields. +# Note: wordpad (write) and others do not support links. + +RTF_HYPERLINKS = NO + +# Load stylesheet definitions from file. Syntax is similar to doxygen's +# config file, i.e. a series of assignments. You only have to provide +# replacements, missing definitions are set to their default value. + +RTF_STYLESHEET_FILE = + +# Set optional variables used in the generation of an rtf document. +# Syntax is similar to doxygen's config file. + +RTF_EXTENSIONS_FILE = + +#--------------------------------------------------------------------------- +# configuration options related to the man page output +#--------------------------------------------------------------------------- + +# If the GENERATE_MAN tag is set to YES (the default) Doxygen will +# generate man pages + +GENERATE_MAN = YES + +# The MAN_OUTPUT tag is used to specify where the man pages will be put. +# If a relative path is entered the value of OUTPUT_DIRECTORY will be +# put in front of it. If left blank `man' will be used as the default path. + +MAN_OUTPUT = + +# The MAN_EXTENSION tag determines the extension that is added to +# the generated man pages (default is the subroutine's section .3) + +MAN_EXTENSION = + +# If the MAN_LINKS tag is set to YES and Doxygen generates man output, +# then it will generate one additional man file for each entity +# documented in the real man page(s). These additional files +# only source the real man page, but without them the man command +# would be unable to find the correct page. The default is NO. 
+ +MAN_LINKS = NO + +#--------------------------------------------------------------------------- +# configuration options related to the XML output +#--------------------------------------------------------------------------- + +# If the GENERATE_XML tag is set to YES Doxygen will +# generate an XML file that captures the structure of +# the code including all documentation. Note that this +# feature is still experimental and incomplete at the +# moment. + +GENERATE_XML = NO + +#--------------------------------------------------------------------------- +# configuration options for the AutoGen Definitions output +#--------------------------------------------------------------------------- + +# If the GENERATE_AUTOGEN_DEF tag is set to YES Doxygen will +# generate an AutoGen Definitions (see autogen.sf.net) file +# that captures the structure of the code including all +# documentation. Note that this feature is still experimental +# and incomplete at the moment. + +GENERATE_AUTOGEN_DEF = NO + +#--------------------------------------------------------------------------- +# Configuration options related to the preprocessor +#--------------------------------------------------------------------------- + +# If the ENABLE_PREPROCESSING tag is set to YES (the default) Doxygen will +# evaluate all C-preprocessor directives found in the sources and include +# files. + +ENABLE_PREPROCESSING = YES + +# If the MACRO_EXPANSION tag is set to YES Doxygen will expand all macro +# names in the source code. If set to NO (the default) only conditional +# compilation will be performed. Macro expansion can be done in a controlled +# way by setting EXPAND_ONLY_PREDEF to YES. + +MACRO_EXPANSION = NO + +# If the EXPAND_ONLY_PREDEF and MACRO_EXPANSION tags are both set to YES +# then the macro expansion is limited to the macros specified with the +# PREDEFINED and EXPAND_AS_PREDEFINED tags. 
+ +EXPAND_ONLY_PREDEF = NO + +# If the SEARCH_INCLUDES tag is set to YES (the default) the includes files +# in the INCLUDE_PATH (see below) will be searched if a #include is found. + +SEARCH_INCLUDES = YES + +# The INCLUDE_PATH tag can be used to specify one or more directories that +# contain include files that are not input files but should be processed by +# the preprocessor. + +INCLUDE_PATH = + +# You can use the INCLUDE_FILE_PATTERNS tag to specify one or more wildcard +# patterns (like *.h and *.hpp) to filter out the header-files in the +# directories. If left blank, the patterns specified with FILE_PATTERNS will +# be used. + +INCLUDE_FILE_PATTERNS = + +# The PREDEFINED tag can be used to specify one or more macro names that +# are defined before the preprocessor is started (similar to the -D option of +# gcc). The argument of the tag is a list of macros of the form: name +# or name=definition (no spaces). If the definition and the = are +# omitted =1 is assumed. + +PREDEFINED = + +# If the MACRO_EXPANSION and EXPAND_ONLY_PREDEF tags are set to YES then +# this tag can be used to specify a list of macro names that should be expanded. +# The macro definition that is found in the sources will be used. +# Use the PREDEFINED tag if you want to use a different macro definition. + +EXPAND_AS_DEFINED = + +# If the SKIP_FUNCTION_MACROS tag is set to YES (the default) then +# doxygen's preprocessor will remove all function-like macros that are alone +# on a line and do not end with a semicolon. Such function macros are typically +# used for boiler-plate code, and will confuse the parser if not removed. + +SKIP_FUNCTION_MACROS = YES + +#--------------------------------------------------------------------------- +# Configuration::additions related to external references +#--------------------------------------------------------------------------- + +# The TAGFILES tag can be used to specify one or more tagfiles. 
+ +TAGFILES = ../../fastos/doc/doxygen/fastos.tag + +# When a file name is specified after GENERATE_TAGFILE, doxygen will create +# a tag file that is based on the input files it reads. + +GENERATE_TAGFILE = juniper.tag + +# If the ALLEXTERNALS tag is set to YES all external classes will be listed +# in the class index. If set to NO only the inherited external classes +# will be listed. + +ALLEXTERNALS = NO + +# If the EXTERNAL_GROUPS tag is set to YES all external groups will be listed +# in the modules index. If set to NO, only the current project's groups will +# be listed. + +EXTERNAL_GROUPS = YES + +# The PERL_PATH should be the absolute path and name of the perl script +# interpreter (i.e. the result of `which perl'). + +PERL_PATH = + +#--------------------------------------------------------------------------- +# Configuration options related to the dot tool +#--------------------------------------------------------------------------- + +# If the CLASS_DIAGRAMS tag is set to YES (the default) Doxygen will +# generate an inheritance diagram (in Html, RTF and LaTeX) for classes with base or +# super classes. Setting the tag to NO turns the diagrams off. Note that this +# option is superseded by the HAVE_DOT option below. This is only a fallback. It is +# recommended to install and use dot, since it yields more powerful graphs. + +CLASS_DIAGRAMS = YES + +# If you set the HAVE_DOT tag to YES then doxygen will assume the dot tool is +# available from the path. This tool is part of Graphviz, a graph visualization +# toolkit from AT&T and Lucent Bell Labs. The other options in this section +# have no effect if this option is set to NO (the default) + +HAVE_DOT = NO + +# If the CLASS_GRAPH and HAVE_DOT tags are set to YES then doxygen +# will generate a graph for each documented class showing the direct and +# indirect inheritance relations. Setting this tag to YES will force the +# CLASS_DIAGRAMS tag to NO. 
+ +CLASS_GRAPH = NO + +# If the COLLABORATION_GRAPH and HAVE_DOT tags are set to YES then doxygen +# will generate a graph for each documented class showing the direct and +# indirect implementation dependencies (inheritance, containment, and +# class references variables) of the class with other documented classes. + +COLLABORATION_GRAPH = YES + +# If set to YES, the inheritance and collaboration graphs will show the +# relations between templates and their instances. + +TEMPLATE_RELATIONS = YES + +# If set to YES, the inheritance and collaboration graphs will hide +# inheritance and usage relations if the target is undocumented +# or is not a class. + +HIDE_UNDOC_RELATIONS = YES + +# If the ENABLE_PREPROCESSING, SEARCH_INCLUDES, INCLUDE_GRAPH, and HAVE_DOT +# tags are set to YES then doxygen will generate a graph for each documented +# file showing the direct and indirect include dependencies of the file with +# other documented files. + +INCLUDE_GRAPH = YES + +# If the ENABLE_PREPROCESSING, SEARCH_INCLUDES, INCLUDED_BY_GRAPH, and +# HAVE_DOT tags are set to YES then doxygen will generate a graph for each +# documented header file showing the documented files that directly or +# indirectly include this file. + +INCLUDED_BY_GRAPH = YES + +# If the GRAPHICAL_HIERARCHY and HAVE_DOT tags are set to YES then doxygen +# will generate a graphical hierarchy of all classes instead of a textual one. + +GRAPHICAL_HIERARCHY = YES + +# The DOT_IMAGE_FORMAT tag can be used to set the image format of the images +# generated by dot. Possible values are gif, jpg, and png. +# If left blank gif will be used. + +DOT_IMAGE_FORMAT = gif + +# The tag DOT_PATH can be used to specify the path where the dot tool can be +# found. If left blank, it is assumed the dot tool can be found on the path. + +DOT_PATH = + +# The DOTFILE_DIRS tag can be used to specify one or more directories that +# contain dot files that are included in the documentation (see the +# \dotfile command). 
+ +DOTFILE_DIRS = + +# The MAX_DOT_GRAPH_WIDTH tag can be used to set the maximum allowed width +# (in pixels) of the graphs generated by dot. If a graph becomes larger than +# this value, doxygen will try to truncate the graph, so that it fits within +# the specified constraint. Beware that most browsers cannot cope with very +# large images. + +MAX_DOT_GRAPH_WIDTH = 1024 + +# The MAX_DOT_GRAPH_HEIGHT tag can be used to set the maximum allowed height +# (in pixels) of the graphs generated by dot. If a graph becomes larger than +# this value, doxygen will try to truncate the graph, so that it fits within +# the specified constraint. Beware that most browsers cannot cope with very +# large images. + +MAX_DOT_GRAPH_HEIGHT = 1024 + +# If the GENERATE_LEGEND tag is set to YES (the default) Doxygen will +# generate a legend page explaining the meaning of the various boxes and +# arrows in the dot generated graphs. + +GENERATE_LEGEND = YES + +# If the DOT_CLEANUP tag is set to YES (the default) Doxygen will +# remove the intermediate dot files that are used to generate +# the various graphs. + +DOT_CLEANUP = YES + +#--------------------------------------------------------------------------- +# Configuration::additions related to the search engine +#--------------------------------------------------------------------------- + +# The SEARCHENGINE tag specifies whether or not a search engine should be +# used. If set to NO the values of all tags below this one will be ignored. + +SEARCHENGINE = NO + +# The CGI_NAME tag should be the name of the CGI script that +# starts the search engine (doxysearch) with the correct parameters. +# A script with this name will be generated by doxygen. + +CGI_NAME = search.cgi + +# The CGI_URL tag should be the absolute URL to the directory where the +# cgi binaries are located. See the documentation of your http daemon for +# details. 
+ +CGI_URL = + +# The DOC_URL tag should be the absolute URL to the directory where the +# documentation is located. If left blank the absolute path to the +# documentation, with file:// prepended to it, will be used. + +DOC_URL = + +# The DOC_ABSPATH tag should be the absolute path to the directory where the +# documentation is located. If left blank the directory on the local machine +# will be used. + +DOC_ABSPATH = + +# The BIN_ABSPATH tag must point to the directory where the doxysearch binary +# is installed. + +BIN_ABSPATH = + +# The EXT_DOC_PATHS tag can be used to specify one or more paths to +# documentation generated for other projects. This allows doxysearch to search +# the documentation for these projects as well. + +EXT_DOC_PATHS = diff --git a/juniper/src/rpclient/testclient.rc b/juniper/src/rpclient/testclient.rc new file mode 100644 index 00000000000..deaacabaec2 --- /dev/null +++ b/juniper/src/rpclient/testclient.rc @@ -0,0 +1,68 @@ +## Dynamic teasers +## Some sensible default values +## This file is used by the testclient application by default, +## if run from this directory. +## Use the -f option to testclient to specify an alternate location + +# A string to be included before each hit in the generated summary +juniper.dynsum.highlight_on <b> + +# A string to be included after each hit in the generated summary +juniper.dynsum.highlight_off </b> + +# A string to be included to denote abbreviated/left out pieces of the +# original text in the generated summary +juniper.dynsum.continuation ... + +# whether or not Juniper should escape the 5 chars <>&"' +# auto means escape if any of the markup defs above starts with < +juniper.dynsum.escape_markup auto + +# Length of the generated summary in bytes. This is a hint to Juniper. +# The result may be slightly longer or shorter depending on the structure +# of the available document text and the submitted query. 
+juniper.dynsum.length 256 + +# The number of (possibly partial) sets of keywords matching the query +# to try to include in the summary. The larger this value is +# set relative to the length parameter, the more dense the keywords +# may appear in the summary. +juniper.dynsum.max_matches 3 + +# The maximal number of bytes of context to prepend and append to each +# of the selected query keyword hits. This parameter defines the max +# size a summary would become if there are few keyword hits (max_matches +# set low or document contained few matches of the keywords). +juniper.dynsum.surround_max 128 + +# The size of the sliding window used to determine if +# multiple query terms occur together. The larger the value, the more +# likely the system will find (and present in dynamic summary) complete +# matches containing all the search terms. The downside is a potential +# performance overhead of keeping candidates for matches longer during +# matching, and consequently updating more candidates that eventually +# get thrown away. +juniper.matcher.winsize 600 + +# The minimal number of bytes in a query keyword for it to be subject +# to the simple Juniper stemming algorithm. Keywords that are shorter +# than or equal to this limit will only yield exact matches in the +# dynamic summaries. +juniper.stem.min_length 5 + +# The maximal number of bytes in a query keyword for it to be subject +# to the simple Juniper stemming algorithm +juniper.stem.max_extend 3 + +# A factor to multiply the internal Juniper metric with when producing +# proximity metric for a given field. A real/floating point value is accepted. +# Default value is 0.25 - scaling down by a factor 4. +# Note that the QRserver also supports a factor that is global to all proximity +# metric fields, and that is applied in addition when proximityboosting +# is enabled there. This parameter applies to Juniper version >= 2.0.4 only, +# and is intended to be used on a per field basis. 
+juniper.proximity.factor 0.25 + +# debugging Juniper (intended for internal usage) +# (See juniperdebug.h for details about the various bits) +juniper.debug_mask 0x0 diff --git a/juniper/src/test/.gitignore b/juniper/src/test/.gitignore new file mode 100644 index 00000000000..46b307da632 --- /dev/null +++ b/juniper/src/test/.gitignore @@ -0,0 +1,16 @@ +*.log +*Suite +*Test +*suite +*test +.depend +Makefile +dummylib +semantic.cache +juniper_appender_test_app +juniper_queryvisitor_test_app +juniper_SrcTestSuite_app +juniper_auxTest_app +juniper_matchobjectTest_app +juniper_mcandTest_app +juniper_queryparserTest_app diff --git a/juniper/src/test/CMakeLists.txt b/juniper/src/test/CMakeLists.txt new file mode 100644 index 00000000000..285e705f42b --- /dev/null +++ b/juniper/src/test/CMakeLists.txt @@ -0,0 +1,66 @@ +# Copyright 2016 Yahoo Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. +vespa_add_executable(juniper_mcandTest_app + SOURCES + mcandTest.cpp + mcandTestApp.cpp + testenv.cpp + DEPENDS + juniper +) +vespa_add_test(NAME juniper_mcandTest_app COMMAND juniper_mcandTest_app) +vespa_add_executable(juniper_queryparserTest_app + SOURCES + queryparserTest.cpp + queryparserTestApp.cpp + fakerewriter.cpp + testenv.cpp + DEPENDS + juniper +) +vespa_add_test(NAME juniper_queryparserTest_app COMMAND juniper_queryparserTest_app) +vespa_add_executable(juniper_matchobjectTest_app + SOURCES + matchobjectTest.cpp + matchobjectTestApp.cpp + testenv.cpp + fakerewriter.cpp + DEPENDS + juniper +) +vespa_add_test(NAME juniper_matchobjectTest_app COMMAND juniper_matchobjectTest_app) +vespa_add_executable(juniper_appender_test_app + SOURCES + appender_test.cpp + DEPENDS + juniper +) +vespa_add_test(NAME juniper_appender_test_app COMMAND juniper_appender_test_app) +vespa_add_executable(juniper_queryvisitor_test_app + SOURCES + queryvisitor_test.cpp + DEPENDS + juniper +) +vespa_add_test(NAME juniper_queryvisitor_test_app COMMAND 
juniper_queryvisitor_test_app) +vespa_add_executable(juniper_auxTest_app + SOURCES + auxTest.cpp + auxTestApp.cpp + testenv.cpp + DEPENDS + juniper +) +vespa_add_test(NAME juniper_auxTest_app COMMAND juniper_auxTest_app) +vespa_add_executable(juniper_SrcTestSuite_app + SOURCES + mcandTest.cpp + queryparserTest.cpp + fakerewriter.cpp + SrcTestSuite.cpp + matchobjectTest.cpp + auxTest.cpp + testenv.cpp + DEPENDS + juniper +) +vespa_add_test(NAME juniper_SrcTestSuite_app COMMAND juniper_SrcTestSuite_app) diff --git a/juniper/src/test/SrcTestSuite.cpp b/juniper/src/test/SrcTestSuite.cpp new file mode 100644 index 00000000000..d725278a009 --- /dev/null +++ b/juniper/src/test/SrcTestSuite.cpp @@ -0,0 +1,69 @@ +// Copyright 2016 Yahoo Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. +/** + * Implementation of the test suite application SrcTestSuite. + * + * @file SrcTestSuite.cpp + * + * @author Knut Omang + * + * @date Created 21 Feb 2003 + * + * $Id$ + * + * <pre> + * Copyright (c) : 2003 Fast Search & Transfer ASA + * ALL RIGHTS RESERVED + * </pre> + ****************************************************************************/ +#include <vespa/fastos/fastos.h> +#include <vespa/log/log.h> +LOG_SETUP("SrcTestSuite"); +#include <vespa/fastlib/testsuite/suite.h> +#include "testenv.h" +#include "mcandTest.h" +#include "queryparserTest.h" +#include "matchobjectTest.h" +#include "auxTest.h" + +/** + * The SrcTestSuite class runs all the unit tests for the src module. + * + * @author Knut Omang + */ +class SrcTestSuite : public Suite { + +public: + SrcTestSuite(); +}; + +SrcTestSuite::SrcTestSuite() : + Suite("SrcTestSuite", &std::cout) +{ + // All tests for this module + AddTest(new MatchCandidateTest()); + AddTest(new MatchObjectTest()); + AddTest(new QueryParserTest()); + AddTest(new AuxTest()); +} + +/** + * The SrcTestSuiteApp class holds the main body for running the + * SrcTestSuite class. 
+ * + * @author Knut Omang + */ +class SrcTestSuiteApp : public FastOS_Application { +public: + virtual int Main(); +}; + +int SrcTestSuiteApp::Main() { + juniper::TestEnv te(this, "../rpclient/testclient.rc"); + SrcTestSuite suite; + suite.Run(); + long failures = suite.Report(); + suite.Free(); + return (int)failures; +} + +FASTOS_MAIN(SrcTestSuiteApp); diff --git a/juniper/src/test/appender_test.cpp b/juniper/src/test/appender_test.cpp new file mode 100644 index 00000000000..a8202bf7a91 --- /dev/null +++ b/juniper/src/test/appender_test.cpp @@ -0,0 +1,62 @@ +// Copyright 2016 Yahoo Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. +#include <vespa/fastos/fastos.h> +#include <vespa/log/log.h> +LOG_SETUP("appender_test"); +#include <vespa/vespalib/testkit/testapp.h> + +#define _NEED_SUMMARY_CONFIG_IMPL +#include <vespa/juniper/SummaryConfig.h> +#include <vespa/juniper/juniperdebug.h> +#include <vespa/juniper/appender.h> +#include <vespa/vespalib/stllike/string.h> +#include <vector> + +using namespace juniper; + +struct FixtureBase +{ + const char *_connectors; + SummaryConfig _cfg; + Appender _appender; + FixtureBase(ConfigFlag preserve_white_space) + : _connectors(""), + _cfg("[on]", "[off]", "[dots]", "\x1f", + reinterpret_cast<const unsigned char*>(_connectors), + ConfigFlag::CF_OFF, + preserve_white_space), + _appender(&_cfg) + { + } + void assertString(const vespalib::string &input, const vespalib::string &output) { + std::vector<char> buf; + _appender.append(buf, input.c_str(), input.size()); + EXPECT_EQUAL(output, vespalib::string(&buf[0], buf.size())); + } +}; + +struct DefaultFixture : public FixtureBase +{ + DefaultFixture() : FixtureBase(ConfigFlag::CF_OFF) {} +}; + +struct PreserveFixture : public FixtureBase +{ + PreserveFixture() : FixtureBase(ConfigFlag::CF_ON) {} +}; + +TEST_F("requireThatMultipleWhiteSpacesAreEliminated", DefaultFixture) +{ + f.assertString("text with\nwhite \nspace like this", + "text with 
white space like this"); +} + +TEST_F("requireThatMultipleWhiteSpacesArePreserved", PreserveFixture) +{ + f.assertString("text with\nwhite \nspace like this", + "text with\nwhite \nspace like this"); +} + +TEST_MAIN() +{ + TEST_RUN_ALL(); +} diff --git a/juniper/src/test/auxTest.cpp b/juniper/src/test/auxTest.cpp new file mode 100644 index 00000000000..7c53b2a7999 --- /dev/null +++ b/juniper/src/test/auxTest.cpp @@ -0,0 +1,947 @@ +// Copyright 2016 Yahoo Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. +#include <vespa/fastos/fastos.h> +#include <vespa/log/log.h> +LOG_SETUP(""); + +#include "auxTest.h" + +// Using separator definitions only from here: + +#define COLOR_HIGH_ON "\e[1;31m" +#define COLOR_HIGH_OFF "\e[0m" + +#ifndef FASTOS_DEBUG +static int debug_level = 0; +#endif + +bool color_highlight = false; +bool verbose = false; +const unsigned char* connectors = reinterpret_cast<const unsigned char*>("-'"); + +using juniper::SpecialTokenRegistry; + +AuxTest::AuxTest() : Test("Auxiliary"), test_methods_(), _sumconf(0) +{ + init(); +} + +AuxTest::~AuxTest() +{ + DeleteSummaryConfig(_sumconf); +} + + +void AuxTest::init() +{ + test_methods_["TestExample"] = + &AuxTest::TestExample; + test_methods_["TestPropertyMap"] = + &AuxTest::TestPropertyMap; + test_methods_["TestRerase"] = + &AuxTest::TestRerase; + test_methods_["TestUTF811"] = + &AuxTest::TestUTF811; + test_methods_["TestUTF812"] = + &AuxTest::TestUTF812; + test_methods_["TestDoubleWidth"] = + &AuxTest::TestDoubleWidth; + test_methods_["TestPartialUTF8"] = + &AuxTest::TestPartialUTF8; + test_methods_["TestLargeBlockChinese"] = + &AuxTest::TestLargeBlockChinese; + test_methods_["TestUTF8context"] = + &AuxTest::TestUTF8context; + test_methods_["TestJapanese"] = + &AuxTest::TestJapanese; + test_methods_["TestStartHits"] = + &AuxTest::TestStartHits; + test_methods_["TestEndHit"] = + &AuxTest::TestEndHit; + test_methods_["TestJuniperStack"] = + &AuxTest::TestJuniperStack; 
+ test_methods_["TestSpecialTokenRegistry"] = + &AuxTest::TestSpecialTokenRegistry; + test_methods_["TestWhiteSpacePreserved"] = + &AuxTest::TestWhiteSpacePreserved; +} + + +// needed closures + +void AuxTest::TestUTF811() +{ + TestUTF8(11); +} + +void AuxTest::TestUTF812() +{ + TestUTF8(12); +} + + +// Counts malformed UTF-8 sequences in data[0..len): stray continuation bytes, +// lead bytes announcing more than 5 continuation bytes, and multi-byte +// sequences cut short by a non-continuation byte or by the end of the buffer. +int +countBrokenUTF8(const char *data, uint32_t len) +{ + int broken = 0; + int remain = 0; + + for (uint32_t i = 0; i < len; ++i) { + unsigned char val = data[i]; + switch (val & 0xc0) { + case 0xc0: // first char + if (remain > 0) { + ++broken; // previous sequence is missing continuation bytes + } + remain = 1; + val <<= 2; + while ((val & 0x80) != 0) { + ++remain; + val <<= 1; + } + if (remain > 5) { + ++broken; + remain = 0; + } + break; + case 0x80: // continuation char + if (remain == 0) { + ++broken; + } else { + --remain; + } + break; + default: // single char + if (remain > 0) { + ++broken; + remain = 0; + } + break; + } + } + if (remain > 0) { + ++broken; // buffer ends in the middle of a multi-byte sequence + } + return broken; +} + +void +AuxTest::TestDoubleWidth() +{ + char input[17] = + "[\x1f\xef\xbd\x93\xef\xbd\x8f\xef\xbd\x8e\xef\xbd\x99\x1f]"; + + juniper::PropertyMap myprops; + myprops // no fallback, should get match + .set("juniper.dynsum.escape_markup", "off") + .set("juniper.dynsum.highlight_off", "</hi>") + .set("juniper.dynsum.continuation", "<sep />") + .set("juniper.dynsum.highlight_on", "<hi>"); + Fast_NormalizeWordFolder wf; + juniper::Juniper juniper(&myprops, &wf); + juniper::Config myConfig("best", juniper); + + juniper::QueryParser q("\xef\xbd\x93\xef\xbd\x8f\xef\xbd\x8e\xef\xbd\x99"); + juniper::QueryHandle qh(q, NULL, juniper.getModifier()); + juniper::Result* res = juniper::Analyse(&myConfig, &qh, + input, 17, 0, 0, 0); + _test(res != NULL); + + juniper::Summary* sum = juniper::GetTeaser(res, NULL); + (void) sum; + // this should work + // _test(sum->Length() != 0); + juniper::ReleaseResult(res); +} + + + +void +AuxTest::TestPartialUTF8() +{ + const int inputSize = 5769; // NB: update this if input is changed + char input[inputSize]; + FastOS_File file("partialutf8.input.utf8"); + 
_test(file.OpenReadOnly()); + _test(file.GetSize() == inputSize); + _test(file.Read(input, inputSize)); + _test(countBrokenUTF8(input, inputSize) == 0); + file.Close(); + + juniper::PropertyMap myprops; + myprops // config taken from vespa test case + .set("juniper.dynsum.escape_markup", "off") + .set("juniper.dynsum.highlight_off", "") + .set("juniper.dynsum.continuation", "") + .set("juniper.dynsum.fallback", "prefix") + .set("juniper.dynsum.highlight_on", ""); + Fast_NormalizeWordFolder wf; + juniper::Juniper juniper(&myprops, &wf); + juniper::Config myConfig("best", juniper); + + juniper::QueryParser q("ipod"); + juniper::QueryHandle qh(q, NULL, juniper.getModifier()); + juniper::Result* res = juniper::Analyse(&myConfig, &qh, + input, inputSize, 0, 0, 0); + _test(res != NULL); + + juniper::Summary* sum = juniper::GetTeaser(res, NULL); + _test(sum->Length() != 0); + + // check for partial/broken utf-8 + _test(countBrokenUTF8(sum->Text(), sum->Length()) == 0); + + juniper::ReleaseResult(res); +} + +void AuxTest::TestLargeBlockChinese() +{ + const int inputSize = 10410; // NB: update this if input is changed + char input[inputSize]; + FastOS_File file("largeblockchinese.input.utf8"); + _test(file.OpenReadOnly()); + _test(file.GetSize() == inputSize); + _test(file.Read(input, inputSize)); + _test(countBrokenUTF8(input, inputSize) == 0); + file.Close(); + + juniper::PropertyMap myprops; + myprops // config taken from reported bug + .set("juniper.dynsum.length", "50") + .set("juniper.dynsum.min_length", "20") + .set("juniper.dynsum.escape_markup", "off") + .set("juniper.dynsum.highlight_off", "") + .set("juniper.dynsum.continuation", "") + .set("juniper.dynsum.fallback", "prefix") + .set("juniper.dynsum.highlight_on", ""); + Fast_NormalizeWordFolder wf; + juniper::Juniper juniper(&myprops, &wf); + juniper::Config myConfig("best", juniper); + + juniper::QueryParser q("希望"); + juniper::QueryHandle qh(q, NULL, juniper.getModifier()); + juniper::Result* res = 
juniper::Analyse(&myConfig, &qh, + input, inputSize, 0, 0, 0); + _test(res != NULL); + + juniper::Summary* sum = juniper::GetTeaser(res, NULL); + _test(sum->Length() != 0); + + // check that the entire block of chinese data is not returned in the summary + _test(sum->Length() < 100); + + // check for partial/broken utf-8 + _test(countBrokenUTF8(sum->Text(), sum->Length()) == 0); + + juniper::ReleaseResult(res); +} + +void AuxTest::TestExample() +{ + juniper::QueryParser q("AND(consume,sleep,tree)"); + juniper::QueryHandle qh(q, NULL, juniper::_Juniper->getModifier()); + + // some content + const char* content = "the monkey consumes bananas and sleeps afterwards." + "&%#%&! cries the sleepy monkey and jumps down from the tree." + "the last token here is split across lines consumed"; + int content_len = strlen(content); + juniper::Result* res = + juniper::Analyse(juniper::TestConfig, + &qh, + content, content_len, + 0, 0, 0); + _test(res != NULL); + + res->Scan(); + Matcher& m = *res->_matcher; + _test(m.TotalMatchCnt(0) == 2 && m.ExactMatchCnt(0) == 0); + juniper::ReleaseResult(res); +} + + +// Checks PropertyMap lookups through the IJuniperProperties interface, +// both with and without a caller-supplied default value. +void +AuxTest::TestPropertyMap() +{ + juniper::PropertyMap map; + IJuniperProperties *props = &map; + map.set("foo", "bar").set("one", "two"); + _test(props->GetProperty("bogus") == NULL); + _test(strcmp(props->GetProperty("bogus", "default"), "default") == 0); + _test(strcmp(props->GetProperty("foo"), "bar") == 0); + _test(strcmp(props->GetProperty("one", "default"), "two") == 0); +} + + +void AuxTest::TestRerase() +{ + std::list<int> ls; + + for (int i = 0; i < 10; i++) + ls.push_back(i); + + for (std::list<int>::reverse_iterator rit = ls.rbegin(); + rit != ls.rend();) + { + if (*rit == 5 || *rit == 6) + { + // STL hackers heaven - puh this was cumbersome.. 
+ std::list<int>::reverse_iterator new_it(ls.erase((++rit).base())); + rit = new_it; + } + else + ++rit; + } + + std::string s; + for (std::list<int>::iterator it = ls.begin(); + it != ls.end(); ++it) + s += ('0' + *it); + _test(s == std::string("01234789")); +} + +// Debug dump with positions for reference +void test_dump(const char* s, unsigned int len) +{ + printf("test_dump: length %u\n", len); + for (unsigned int i = 0; i < len;) + { + unsigned int start = i; + for (; i < len;) + { + if (s[i] < 0) { + printf("�"); + } else { + printf("%c", s[i]); + } + i++; + if (!(i % 100)) break; + } + printf("\n"); + i = start + 10; + for (; i < len && i % 100; i+= 10) + printf("%7s%3d", "", i); + printf("\n"); + } +} + + +void AuxTest::TestUTF8(unsigned int size) +{ + const char* s = u8"\u00e5pent s\u00f8k\u00e6\u00f8\u00e5\u00e6\u00f8\u00e5\u00e6\u00f8\u00e5"; + const unsigned char* p = (const unsigned char*)s; + + int moved = 0; + for (int i = 0; i < (int)size + 2; i++) + { + // Forward tests: + p = (const unsigned char*)(s + i); + moved = Fast_UnicodeUtil::UTF8move((const unsigned char*)s, size, p, +1); + LOG(spam, "forw. 
moved %d, pos %d", moved, i); + if (i == 0 || i == 8) + _test(moved == 2); + else if (i >= (int)size) + _test(moved == -1); + else + _test(moved == 1); + + // backward tests + p = (const unsigned char*)(s + i); + moved = Fast_UnicodeUtil::UTF8move((const unsigned char*)s, size, p, -1); + LOG(spam, "backw.moved %d, pos %d", moved, i); + if (i == 10 || i == 9 || i == 2) + _test(moved == 2); + else if (i == 0 || i > (int)size) + _test(moved == -1); + else + _test(moved == 1); + + // move-to-start tests: + p = (const unsigned char*)(s + i); + moved = Fast_UnicodeUtil::UTF8move((const unsigned char*)s, size, p, 0); + LOG(spam, "to-start.moved %d, pos %d", moved, i); + if (i == 9 || i == 1) + _test(moved == 1); + else if (i >= (int)size) + _test(moved == -1); + else + _test(moved == 0); + } + + // Assumption about equality of UCS4 IsWordChar and isalnum for + // ascii (c < 128) : + for (unsigned char c = 0; c < 128; c++) + { + const unsigned char* pc = &c; + ucs4_t u = Fast_UnicodeUtil::GetUTF8Char(pc); + bool utf8res = Fast_UnicodeUtil::IsWordChar(u); + bool asciires = isalnum(c); + _test(utf8res == asciires); + if (utf8res != asciires) + fprintf(stderr, ":%c:%d != :%c:%d\n", u, utf8res, c, asciires); + } +} + + +void AuxTest::TestUTF8context() +{ + const char* iso_cont = u8"AND(m\u00b5ss,fast,s\u00f8kemotor,\u00e5relang)"; + juniper::QueryParser q(iso_cont); + juniper::QueryHandle qh(q, NULL, juniper::_Juniper->getModifier()); + + // some content + std::string s(u8"Fast leverer s\u00d8kemotorer og andre nyttige ting for \u00e5 finne frem p\u00e5 "); + s.append(u8"internett. Teknologien er basert p\u00e5 \u00c5relang"); + s += UNIT_SEPARATOR; + s.append(u8"norsk innsats og forskning i"); + s += GROUP_SEPARATOR; + s.append(u8"trondheimsmilj\u00f8et. M\u00b5ss med denne nye funksjonaliteten for \u00e5 vise frem"); + s += UNIT_SEPARATOR; + s.append(u8" beste forekomst av s\u00f8ket med s\u00f8kemotor til brukeren blir det enda bedre. 
"); + s.append(u8"Hvis bare UTF8-kodingen virker som den skal for tegn som tar mer enn \u00e9n byte."); + + juniper::Result* res = juniper::Analyse(juniper::TestConfig, &qh, s.c_str(), s.size(), 0, 0, 0); + _test(res != NULL); + + size_t charsize; + Matcher& m = *res->_matcher; + + res->Scan(); + _test(m.TotalMatchCnt(0) == 1 && m.ExactMatchCnt(0) == 1); + _test(m.TotalMatchCnt(1) == 1 && m.ExactMatchCnt(2) == 1); + _test(m.TotalMatchCnt(2) == 2 && m.ExactMatchCnt(2) == 1); + _test(m.TotalMatchCnt(3) == 1 && m.ExactMatchCnt(2) == 1); + + char separators[3]; + separators[0] = UNIT_SEPARATOR; + separators[1] = GROUP_SEPARATOR; + separators[2] = '\0'; + + if (color_highlight) + _sumconf = CreateSummaryConfig(COLOR_HIGH_ON, COLOR_HIGH_OFF, "...", separators, connectors); + else + _sumconf = CreateSummaryConfig("<hit>", "</hit>", "...", separators, connectors); + for (int i = 1; i <= 10; i++) + { + // Short summaries with many matches + test_summary(m, s.c_str(), s.size(), i*30, i / 3, i*10, charsize); + // fewer matches, longer summaries + test_summary(m, s.c_str(), s.size(), i*60, i / 6, i*20, charsize); + } + // Summary som er stort nok til � ta hele teksten + test_summary(m, s.c_str(), s.size(), 800, 100, 300, charsize); + // fprintf(stderr, "charsize %d s.size %d\n", charsize, s.size()); + _test(charsize == s.size() - 3 - 11); // Subtract eliminated separators and dual bytes + + // "Syke" settinger for summary: + test_summary(m, s.c_str(), s.size(), 10000, 0, 1000, charsize); + // fprintf(stderr, "charsize %d s.size %d\n", charsize, s.size()); + _test(charsize == s.size() - 3 - 11); // Subtract eliminated separators and dual bytes + + if (GetNumFailed() > 0 && debug_level > 0) + { + fprintf(stderr, "Characters in original text: %ld\n", s.size()); + test_dump(s.c_str(), s.size()); + m.dump_statistics(); + } + juniper::ReleaseResult(res); +} + + +const char* japanese_sep_ex = "。"; + +struct TermTextPair +{ + const char* term; + const char* text; +}; + +static 
TermTextPair testjap[] = +{ + // japanese string as term + { "ç§ã¯ã‚¬ãƒ©ã‚¹ã‚’食ã¹ã‚‰ã‚Œã¾ã™", + "this is some japanese: ç§ã¯ã‚¬ãƒ©ã‚¹ã‚’食ã¹ã‚‰ã‚Œã¾ã™ã€‚ãã‚Œã¯ç§ã‚’å‚·ã¤ã‘ã¾ã›ã‚“。 ending here" }, + + // HUGE japanese prefix and postfix and simple match in middle: + { "bond", + "ç§ã¯ã‚¬ãƒ©ã‚¹ã‚’食ã¹ã‚‰ã‚Œã¾ã™ã€‚ãã‚Œã¯ç§ã‚’å‚·ã¤ã‘ã¾ã›ã‚“。ç§ã¯ã‚¬ãƒ©ã‚¹ã‚’食ã¹ã‚‰ã‚Œã¾ã™ã€‚ãã‚Œã¯ç§ã‚’å‚·ã¤ã‘ã¾ã›ã‚“。ç§ã¯ã‚¬ãƒ©ã‚¹ã‚’食ã¹ã‚‰ã‚Œã¾ã™ã€‚ãã‚Œã¯ç§ã‚’å‚·ã¤ã‘ã¾ã›ã‚“。ç§ã¯ã‚¬ãƒ©ã‚¹ã‚’食ã¹ã‚‰ã‚Œã¾ã™ã€‚ãã‚Œã¯ç§ã‚’å‚·ã¤ã‘ã¾ã›ã‚“。ç§ã¯ã‚¬ãƒ©ã‚¹ã‚’食ã¹ã‚‰ã‚Œã¾ã™ã€‚ãã‚Œã¯ç§ã‚’å‚·ã¤ã‘ã¾ã›ã‚“。ç§ã¯ã‚¬ãƒ©ã‚¹ã‚’食ã¹ã‚‰ã‚Œã¾ã™ã€‚ãã‚Œã¯ç§ã‚’å‚·ã¤ã‘ã¾ã›ã‚“。ç§ã¯ã‚¬ãƒ©ã‚¹ã‚’食ã¹ã‚‰ã‚Œã¾ã™ã€‚ãã‚Œã¯ç§ã‚’å‚·ã¤ã‘ã¾ã›ã‚“。ç§ã¯ã‚¬ãƒ©ã‚¹ã‚’食ã¹ã‚‰ã‚Œã¾ã™ã€‚ãã‚Œã¯ç§ã‚’å‚·ã¤ã‘ã¾ã›ã‚“。ç§ã¯ã‚¬ãƒ©ã‚¹ã‚’食ã¹ã‚‰ã‚Œã¾ã™ã€‚ãã‚Œã¯ç§ã‚’å‚·ã¤ã‘ã¾ã›ã‚“。ç§ã¯ã‚¬ãƒ©ã‚¹ã‚’食ã¹ã‚‰ã‚Œã¾ã™ã€‚ãã‚Œã¯ç§ã‚’å‚·ã¤ã‘ã¾ã›ã‚“。ç§ã¯ã‚¬ãƒ©ã‚¹ã‚’食ã¹ã‚‰ã‚Œã¾ã™ã€‚ãã‚Œã¯ç§ã‚’å‚·ã¤ã‘ã¾ã›ã‚“。ç§ã¯ã‚¬ãƒ©ã‚¹ã‚’食ã¹ã‚‰ã‚Œã¾ã™ã€‚ãã‚Œã¯ç§ã‚’å‚·ã¤ã‘ã¾ã›ã‚“。 bond ç§ã¯ã‚¬ãƒ©ã‚¹ã‚’食ã¹ã‚‰ã‚Œã¾ã™ã€‚ãã‚Œã¯ç§ã‚’å‚·ã¤ã‘ã¾ã›ã‚“。ç§ã¯ã‚¬ãƒ©ã‚¹ã‚’食ã¹ã‚‰ã‚Œã¾ã™ã€‚ãã‚Œã¯ç§ã‚’å‚·ã¤ã‘ã¾ã›ã‚“。ç§ã¯ã‚¬ãƒ©ã‚¹ã‚’食ã¹ã‚‰ã‚Œã¾ã™ã€‚ãã‚Œã¯ç§ã‚’å‚·ã¤ã‘ã¾ã›ã‚“。ç§ã¯ã‚¬ãƒ©ã‚¹ã‚’食ã¹ã‚‰ã‚Œã¾ã™ã€‚ãã‚Œã¯ç§ã‚’å‚·ã¤ã‘ã¾ã›ã‚“。ç§ã¯ã‚¬ãƒ©ã‚¹ã‚’食ã¹ã‚‰ã‚Œã¾ã™ã€‚ãã‚Œã¯ç§ã‚’å‚·ã¤ã‘ã¾ã›ã‚“。ç§ã¯ã‚¬ãƒ©ã‚¹ã‚’食ã¹ã‚‰ã‚Œã¾ã™ã€‚ãã‚Œã¯ç§ã‚’å‚·ã¤ã‘ã¾ã›ã‚“。ç§ã¯ã‚¬ãƒ©ã‚¹ã‚’食ã¹ã‚‰ã‚Œã¾ã™ã€‚ãã‚Œã¯ç§ã‚’å‚·ã¤ã‘ã¾ã›ã‚“。ç§ã¯ã‚¬ãƒ©ã‚¹ã‚’食ã¹ã‚‰ã‚Œã¾ã™ã€‚ãã‚Œã¯ç§ã‚’å‚·ã¤ã‘ã¾ã›ã‚“。ç§ã¯ã‚¬ãƒ©ã‚¹ã‚’食ã¹ã‚‰ã‚Œã¾ã™ã€‚ãã‚Œã¯ç§ã‚’å‚·ã¤ã‘ã¾ã›ã‚“。ç§ã¯ã‚¬ãƒ©ã‚¹ã‚’食ã¹ã‚‰ã‚Œã¾ã™ã€‚ãã‚Œã¯ç§ã‚’å‚·ã¤ã‘ã¾ã›ã‚“。ç§ã¯ã‚¬ãƒ©ã‚¹ã‚’食ã¹ã‚‰ã‚Œã¾ã™ã€‚ãã‚Œã¯ç§ã‚’å‚·ã¤ã‘ã¾ã›ã‚“。ç§ã¯ã‚¬ãƒ©ã‚¹ã‚’食ã¹ã‚‰ã‚Œã¾ã™ã€‚ãã‚Œã¯ç§ã‚’å‚·ã¤ã‘ã¾ã›ã‚“。" }, + { "japanese", "Simple。match。check。for。japanese。sep" }, + { "hit", " -. 
hit at start" }, + { "hit", "hit at end .,: " }, + { "hit", "---------------------------------------------------------------------------------------------------------------------this is a text that is long enough to generate a hit that does have dots on both sides ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; " }, + { NULL, NULL } +}; + + +void AuxTest::TestJapanese() +{ + for (int i = 0; testjap[i].term != NULL; i++) + { + const char* qstr = testjap[i].term; + juniper::QueryParser q(qstr); + juniper::QueryHandle qh(q, NULL, juniper::_Juniper->getModifier()); + + const char* content = testjap[i].text; + int content_len = strlen(content); + juniper::Result* res = juniper::Analyse(juniper::TestConfig, &qh, + content, content_len, + 0, 0, 0); + _test(res != NULL); + + size_t charsize; + Matcher& m = *res->_matcher; + + res->Scan(); + if (color_highlight) + _sumconf = CreateSummaryConfig(COLOR_HIGH_ON, COLOR_HIGH_OFF, "...", "", connectors); + else + _sumconf = CreateSummaryConfig("<hit>", "</hit>", "...", "", connectors); + + SummaryDesc* sumdesc = m.CreateSummaryDesc(256, 256, 4, 80); + _test(sumdesc != NULL); + if (!sumdesc) + return; + std::string sum = BuildSummary(content, content_len, sumdesc, _sumconf, charsize); + + switch (i) + { + case 0: + // Matching a multibyte sequence + _test(m.TotalMatchCnt(0) == 1 && m.ExactMatchCnt(0) == 1); + // printf("total %d exact %d\n", m.TotalMatchCnt(0),m.ExactMatchCnt(0)); + break; + case 1: + // Matching short word in loong multibyte sequence + _test(m.TotalMatchCnt(0) == 1 && m.ExactMatchCnt(0) == 1); + _test(sum.size() <= 400); + break; + case 2: + // Matching word in between multibyte separators + _test(m.TotalMatchCnt(0) == 1 && m.ExactMatchCnt(0) == 1); + break; + case 3: + // Check that result is the complete string (markup excluded) + _test(sum.size() - 11 == charsize); + // printf("sz %d 
charsz %d :%s:\n", sum.size(), charsize, sum.c_str()); + break; + case 4: + // Check that result is the complete string (markup excluded) + _test(sum.size() - 11 == charsize); + // printf("sz %d charsz %d :%s:\n", sum.size(), charsize, sum.c_str()); + break; + case 5: + // Check that we get no noise at the start or end of this + _test(sum.size() == 103 && charsize == 86); + // printf("sz %d charsz %d :%s:\n", sum.size(), charsize, sum.c_str()); + break; + default: + break; + } + juniper::ReleaseResult(res); + DeleteSummaryDesc(sumdesc); + DeleteSummaryConfig(_sumconf); + } +} + + +void AuxTest::test_summary(Matcher& m, const char* content, size_t content_len, + int size, int matches, int surround, size_t& charsize) +{ + SummaryDesc* sum = m.CreateSummaryDesc(size, size, matches, surround); + _test(sum != NULL); + if (!sum) + { + // No summary generated! + return; + } + std::string res = BuildSummary(content, content_len, sum, _sumconf, charsize); + + if ((verbose || GetNumFailed() > 0) && debug_level > 0) { + printf("\nRequested size: %d, matches: %d, surround: %d, Summary size %lu :%s:\n", + size, matches, surround, static_cast<unsigned long>(res.size()), res.c_str()); + } + DeleteSummaryDesc(sum); +} + + +class DefProps : public IJuniperProperties +{ +public: + virtual const char* GetProperty(const char*, const char* def) + { + return def; + } +}; + + +void AuxTest::TestStartHits() +{ + juniper::QueryParser q("elvis"); + juniper::QueryHandle qh(q, "dynlength.120", juniper::_Juniper->getModifier()); + + const char* content = + "Elvis, this is a long match before matching Elvis again and then som more text at" + " the end. But this text at the end must be much longer than this to trigger the case." + " In fact it must be much longer. And then som more text at the end. 
But this text at " + "the end must be much longer than this to trigger the case"; + int content_len = strlen(content); + juniper::Result* res = juniper::Analyse(juniper::TestConfig, &qh, + content, content_len, + 0, 0, 0); + _test(res != NULL); + + juniper::Summary* sum = juniper::GetTeaser(res, NULL); + (void) sum; + // TODO: ReEnable _test(sum->Length() != 0); + juniper::ReleaseResult(res); +} + + +void AuxTest::TestEndHit() +{ + juniper::QueryParser q("match"); + juniper::QueryHandle qh(q, "dynlength.120", juniper::_Juniper->getModifier()); + + const char* content = + "In this case we need a fairly long text that does not fit entirely into the resulting" + " summary, but that has a hit towards the end of the document where the expected length" + " extends the end of the doc. This means that the prefix must be more than 256 bytes" + " long. Here is the stuff we are looking for to match in a case where we have " + "surround_len bytes closer than good towardstheend�����������������������������������"; + size_t content_len = strlen(content) - 55; + + juniper::Result* res = juniper::Analyse(juniper::TestConfig, &qh, + content, content_len, + 0, 0, 0); + _test(res != NULL); + + juniper::Summary* sum = juniper::GetTeaser(res, NULL); + _test(sum->Length() != 0); + juniper::ReleaseResult(res); +} + + + +class TokenChecker : public ITokenProcessor +{ +private: + TokenChecker(const TokenChecker&); + TokenChecker& operator= (const TokenChecker&); + + Token* _out; + int i; +public: + TokenChecker(Token* output) : _out(output), i(0) + { } + + virtual void handle_token(Token& token) + { + _out[i] = token; + i++; + } + + virtual void handle_end(Token&) {} +}; + + +void AuxTest::TestJuniperStack() +{ + // Stack simplification tests + QueryExpr* q = new QueryNode(1, 0, 0); + QueryExpr* q1 = new QueryNode(1, 0, 0); + QueryExpr* q2 = new QueryTerm("Hepp", 4, 0); + q->AddChild(q1); + q1->AddChild(q2); + + SimplifyStack(q); + + std::string s; + q->Dump(s); + 
_test(strcmp(s.c_str(),"Hepp:100") == 0); + delete q; + + if (GetNumFailed() > 0) + fprintf(stderr, "TestJuniperStack: %s\n", s.c_str()); + + q = new QueryNode(2, 0, 0); + q->_arity = 0; + SimplifyStack(q); + std::string s1; + _test(q == NULL); + + if (GetNumFailed() > 0) + fprintf(stderr, "TestJuniperStack: %s\n", s.c_str()); +} + +class TokenProcessor : public ITokenProcessor { +private: + const std::string & _text; + std::vector<std::string> _tokens; +public: + TokenProcessor(const std::string & text) : _text(text), _tokens() {} + virtual void handle_token(Token & t) { + _tokens.push_back(std::string(_text.c_str() + t.bytepos, t.bytelen)); + //LOG(info, "handle_token(%s): bytepos(%d), wordpos(%d), bytelen(%d), curlen(%d)", + //_tokens.back().c_str(), + //(int)t.bytepos, (int)t.wordpos, t.bytelen, t.curlen); + } + virtual void handle_end(Token & t) { + _tokens.push_back(std::string(_text.c_str() + t.bytepos, t.bytelen)); + //LOG(info, "handle_end(%s): bytepos(%d), wordpos(%d), bytelen(%d), curlen(%d)", + //_tokens.back().c_str(), + //(int)t.bytepos, (int)t.wordpos, t.bytelen, t.curlen); + } + void clearTokens() { _tokens.clear(); } + const std::vector<std::string> & getTokens() const { return _tokens; } +}; + + +bool +AuxTest::assertChar(ucs4_t act, char exp) +{ + //LOG(info, "assertChar(%d(%c), %c)", act, (char)act, exp); + return _test((char) act == exp); +} + +void +AuxTest::TestSpecialTokenRegistry() +{ + { + typedef SpecialTokenRegistry::CharStream CharStream; + ucs4_t buf[16]; + { + std::string text = " c+-"; + CharStream cs(text.c_str(), text.c_str() + text.size(), buf, buf + 16); + _test(!cs.isStartWordChar()); + _test(cs.hasMoreChars()); + _test(assertChar(cs.getNextChar(), ' ')); + _test(cs.hasMoreChars()); + cs.reset(); + _test(cs.hasMoreChars()); + _test(assertChar(cs.getNextChar(), ' ')); + _test(assertChar(cs.getNextChar(), 'c')); + _test(cs.hasMoreChars()); + cs.reset(); + _test(cs.hasMoreChars()); + _test(assertChar(cs.getNextChar(), ' ')); + 
_test(assertChar(cs.getNextChar(), 'c')); + _test(assertChar(cs.getNextChar(), '+')); + _test(cs.hasMoreChars()); + cs.reset(); + _test(cs.hasMoreChars()); + _test(assertChar(cs.getNextChar(), ' ')); + _test(assertChar(cs.getNextChar(), 'c')); + _test(assertChar(cs.getNextChar(), '+')); + _test(assertChar(cs.getNextChar(), '-')); + _test(!cs.hasMoreChars()); + cs.reset(); + _test(cs.hasMoreChars()); + _test(assertChar(cs.getNextChar(), ' ')); + _test(cs.hasMoreChars()); + _test(assertChar(cs.getNextChar(), 'c')); + _test(cs.hasMoreChars()); + _test(assertChar(cs.getNextChar(), '+')); + _test(cs.hasMoreChars()); + _test(assertChar(cs.getNextChar(), '-')); + _test(!cs.hasMoreChars()); + } + { // test reset with increase to next char + std::string text = " c+-"; + CharStream cs(text.c_str(), text.c_str() + text.size(), buf, buf + 16); + _test(cs.resetAndInc()); + _test(cs.isStartWordChar()); + _test(cs.hasMoreChars()); + _test(assertChar(cs.getNextChar(), 'c')); + _test(assertChar(cs.getNextChar(), '+')); + _test(assertChar(cs.getNextChar(), '-')); + _test(!cs.hasMoreChars()); + cs.reset(); + _test(cs.hasMoreChars()); + _test(assertChar(cs.getNextChar(), 'c')); + _test(assertChar(cs.getNextChar(), '+')); + _test(assertChar(cs.getNextChar(), '-')); + _test(!cs.hasMoreChars()); + _test(cs.resetAndInc()); + _test(!cs.isStartWordChar()); + _test(cs.hasMoreChars()); + _test(assertChar(cs.getNextChar(), '+')); + _test(assertChar(cs.getNextChar(), '-')); + _test(!cs.hasMoreChars()); + _test(cs.resetAndInc()); + _test(!cs.isStartWordChar()); + _test(cs.hasMoreChars()); + _test(assertChar(cs.getNextChar(), '-')); + _test(!cs.hasMoreChars()); + _test(!cs.resetAndInc()); + _test(!cs.hasMoreChars()); + } + { // test lower case + std::string text = "C"; + CharStream cs(text.c_str(), text.c_str() + text.size(), buf, buf + 16); + _test(assertChar(cs.getNextChar(), 'c')); + } + } + { // test tokenizer with special token registry + typedef std::unique_ptr<QueryNode> QueryNodeUP; + 
struct QB { + QueryNodeUP q; + QB(size_t numTerms) : q(new QueryNode(numTerms, 0, 0)) {} + QB(QB & rhs) : q(std::move(rhs.q)) { } + QB & add(const char * t, bool st = true) { + QueryTerm * qt = new QueryTerm(t, strlen(t), 0); + if (st) qt->_options |= X_SPECIALTOKEN; + q->AddChild(qt); + return *this; + } + }; + struct Ctx { + std::string text; + QB qb; + SpecialTokenRegistry str; + Fast_NormalizeWordFolder wf; + TokenProcessor tp; + JuniperTokenizer jt; + Ctx(const std::string & text_, QB & qb_) : text(text_), qb(qb_), str(qb.q.get()), wf(), tp(text), jt(&wf, text.c_str(), text.size(), &tp, &str) { jt.scan(); } + }; + + { // only special token registered + Ctx c("foo", QB(2).add("c++").add("foo", false)); + _test(c.str.getSpecialTokens().size() == 1); + } + { // various matches + std::string annotation = "\357\277\271dvdplusminus\357\277\272dvd+-\357\277\273"; + std::string text = "c++ !my C++ text ?.net dvd+- stuff " + annotation; + Ctx c(text, QB(3).add("c++").add(".net").add("dvd+-", false)); + _test(c.str.getSpecialTokens().size() == 2); + _test(c.tp.getTokens().size() == 9); + _test(c.tp.getTokens()[0] == "c++"); + _test(c.tp.getTokens()[1] == "my"); + _test(c.tp.getTokens()[2] == "C++"); + _test(c.tp.getTokens()[3] == "text"); + _test(c.tp.getTokens()[4] == ".net"); + _test(c.tp.getTokens()[5] == "dvd"); + _test(c.tp.getTokens()[6] == "stuff"); + _test(c.tp.getTokens()[7] == annotation); + _test(c.tp.getTokens()[8] == ""); + } + { // cannot start inside a word + Ctx c("foo ac++", QB(1).add("c++")); + _test(c.tp.getTokens().size() == 3); + _test(c.tp.getTokens()[0] == "foo"); + _test(c.tp.getTokens()[1] == "ac"); + _test(c.tp.getTokens()[2] == ""); + } + { // can end inside a word (TODO: can be fixed if it is a problem) + Ctx c("++ca foo", QB(1).add("++c")); + _test(c.tp.getTokens().size() == 4); + _test(c.tp.getTokens()[0] == "++c"); + _test(c.tp.getTokens()[1] == "a"); + _test(c.tp.getTokens()[2] == "foo"); + _test(c.tp.getTokens()[3] == ""); + } + { // 
many scans but only match at the end + Ctx c("a+b- a+b+c- a+b+c+", QB(1).add("a+b+c+")); + _test(c.tp.getTokens().size() == 7); + _test(c.tp.getTokens()[0] == "a"); + _test(c.tp.getTokens()[1] == "b"); + _test(c.tp.getTokens()[2] == "a"); + _test(c.tp.getTokens()[3] == "b"); + _test(c.tp.getTokens()[4] == "c"); + _test(c.tp.getTokens()[5] == "a+b+c+"); + _test(c.tp.getTokens()[6] == ""); + } + { // two special tokens (one being a substring of the other) + Ctx c("c+c+c-", QB(2).add("c+c+c+").add("+c+")); + _test(c.tp.getTokens().size() == 4); + _test(c.tp.getTokens()[0] == "c"); + _test(c.tp.getTokens()[1] == "+c+"); + _test(c.tp.getTokens()[2] == "c"); + _test(c.tp.getTokens()[3] == ""); + } + { // cjk + Ctx c("fish: \xE9\xB1\xBC!", QB(1).add("\xE9\xB1\xBC!")); + _test(c.tp.getTokens().size() == 3); + _test(c.tp.getTokens()[0] == "fish"); + _test(c.tp.getTokens()[1] == "\xE9\xB1\xBC!"); + _test(c.tp.getTokens()[2] == ""); + } + { // special token with non-word first + Ctx c("+++c ..net", QB(2).add("++c").add(".net")); + _test(c.tp.getTokens().size() == 3); + _test(c.tp.getTokens()[0] == "++c"); + _test(c.tp.getTokens()[1] == ".net"); + _test(c.tp.getTokens()[2] == ""); + } + } +} + +void +AuxTest::TestWhiteSpacePreserved() +{ + vespalib::string input = "\x1f" + "best" + "\x1f" + " " + "\x1f" + "of" + "\x1f" + " " + "\n" + "\x1f" + "metallica" + "\x1f"; + + juniper::PropertyMap myprops; + myprops.set("juniper.dynsum.escape_markup", "off") + .set("juniper.dynsum.highlight_off", "</hi>") + .set("juniper.dynsum.continuation", "<sep />") + .set("juniper.dynsum.highlight_on", "<hi>") + .set("juniper.dynsum.preserve_white_space", "on"); + Fast_NormalizeWordFolder wf; + juniper::Juniper juniper(&myprops, &wf); + juniper::Config myConfig("myconfig", juniper); + + juniper::QueryParser q("best"); + juniper::QueryHandle qh(q, NULL, juniper.getModifier()); + juniper::Result* res = juniper::Analyse(&myConfig, &qh, input.c_str(), input.size(), 0, 0, 0); + _test(res != NULL); + + 
juniper::Summary* sum = juniper::GetTeaser(res, NULL); + vespalib::string expected = "<hi>best</hi> of \nmetallica"; + vespalib::string actual(sum->Text(), sum->Length()); + _test(actual == expected); + juniper::ReleaseResult(res); +} + +void AuxTest::Run(MethodContainer::iterator &itr) { + try { + (this->*itr->second)(); + } catch (...) { + _fail("Got unknown exception in test method " + itr->first); + } +} + +void AuxTest::Run(const char* method) { + MethodContainer::iterator pos(test_methods_.find(method)); + if (pos != test_methods_.end()) { + Run(pos); + } else { + std::cerr << "ERROR: No test method named \"" + << method << "\"" << std::endl; + _fail("No such method"); + } +} + +void AuxTest::Run() { + for (MethodContainer::iterator itr(test_methods_.begin()); + itr != test_methods_.end(); + ++itr) + Run(itr); +} + + +void AuxTest::Run(int argc, char* argv[]) +{ + for (int i = 1; i < argc; ++i) + { + if (strcmp(argv[i], "-m") == 0 && argc > i + 1) + { + Run(argv[++i]); + return; + } + } + Run(); +} diff --git a/juniper/src/test/auxTest.h b/juniper/src/test/auxTest.h new file mode 100644 index 00000000000..43b89058b16 --- /dev/null +++ b/juniper/src/test/auxTest.h @@ -0,0 +1,66 @@ +// Copyright 2016 Yahoo Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. 
+#pragma once + +// Auxiliary tests for juniper - based on Juniper 1.x proximitytest.cpp + +#include <map> +#include <vespa/fastlib/testsuite/test.h> +#include "testenv.h" + + +class AuxTest : public Test +{ +private: + AuxTest(const AuxTest&); + AuxTest& operator=(const AuxTest&); +public: + AuxTest(); + virtual ~AuxTest(); + + typedef void(AuxTest::* tst_method_ptr) (); + typedef std::map<std::string, tst_method_ptr> MethodContainer; + MethodContainer test_methods_; + void init(); + + void Run(MethodContainer::iterator &itr); + void Run(const char* method); + void Run(int argc, char* argv[]); + virtual void Run(); +protected: + /** + * Since we are running within Emacs, the default behavior of + * print_progress which includes backspace does not work. + * We'll use a single '.' instead. + */ + virtual void print_progress() { *m_osptr << '.' << std::flush; } +private: + // tests: + void TestPropertyMap(); + void TestRerase(); + void TestExample(); + void TestUTF8context(); + void TestJapanese(); + void TestStartHits(); + void TestEndHit(); + void TestJuniperStack(); + void TestUTF811(); + void TestUTF812(); + void TestDoubleWidth(); + void TestPartialUTF8(); + void TestLargeBlockChinese(); + void TestSpecialTokenRegistry(); + void TestWhiteSpacePreserved(); + + bool assertChar(ucs4_t act, char exp); + + // Utilities + char* IsoToUtf8 (const char* iso, size_t size); + char* Utf8ToIso (const char* iso, size_t size); + void test_summary(Matcher& m, const char* input, size_t input_len, + int size, int matches, int surround, size_t& charsize); + void TestUTF8(unsigned int size); + + bool _split_char; + SummaryConfig* _sumconf; +}; + diff --git a/juniper/src/test/auxTestApp.cpp b/juniper/src/test/auxTestApp.cpp new file mode 100644 index 00000000000..7b37cc909bb --- /dev/null +++ b/juniper/src/test/auxTestApp.cpp @@ -0,0 +1,30 @@ +// Copyright 2016 Yahoo Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. 
+#include <vespa/fastos/fastos.h> +#include <vespa/log/log.h> +LOG_SETUP("auxTest"); +#include "auxTest.h" + +class AuxTestApp : public FastOS_Application +{ +public: + virtual int Main(); +}; + + + +void Usage(char* s) +{ + fprintf(stderr, "Usage: %s [-d debug_level]\n", s); +} + + +int AuxTestApp::Main() +{ + juniper::TestEnv te(this, "../rpclient/testclient.rc"); + AuxTest pta; + pta.SetStream(&std::cout); + pta.Run(_argc, _argv); + return pta.Report(); +} + +FASTOS_MAIN(AuxTestApp); diff --git a/juniper/src/test/fakerewriter.cpp b/juniper/src/test/fakerewriter.cpp new file mode 100644 index 00000000000..16e3bd0ee7a --- /dev/null +++ b/juniper/src/test/fakerewriter.cpp @@ -0,0 +1,64 @@ +// Copyright 2016 Yahoo Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. +#include <vespa/fastos/fastos.h> +#include <vespa/log/log.h> +LOG_SETUP(""); +#include "fakerewriter.h" +#include <vespa/vespalib/util/stringfmt.h> + +namespace juniper +{ + +struct RewriteHandle +{ + RewriteHandle(std::string& in, uint32_t langid) + : _s(in), _ls(""), _cnt(0), _langid(langid) {} + + std::string& next() + { + if (_cnt > 3 || _langid > 4) + _ls = ""; + else + _ls = vespalib::make_string("%s%d", _s.c_str(), _cnt++); + return _ls; + } + std::string _s; + std::string _ls; + int _cnt; + uint32_t _langid; +}; +} // end namespace juniper + +using namespace juniper; + +const char* FakeRewriter::Name() const +{ + return _name.c_str(); +} + + +RewriteHandle* FakeRewriter::Rewrite(uint32_t langid, const char* term) +{ + std::string t(term); + if (langid > 4) return NULL; + return new RewriteHandle(t, langid); +} + +RewriteHandle* FakeRewriter::Rewrite(uint32_t langid, const char* term, size_t length) +{ + std::string t(term, length); + if (langid > 4) return NULL; + return new RewriteHandle(t, langid); +} + + +const char* FakeRewriter::NextTerm(RewriteHandle* exp, size_t& length) +{ + std::string& t = exp->next(); + if (t.size() == 0) + { + delete exp; + return 
NULL; + } + length = t.size(); + return t.c_str(); +} diff --git a/juniper/src/test/fakerewriter.h b/juniper/src/test/fakerewriter.h new file mode 100644 index 00000000000..8d09de56644 --- /dev/null +++ b/juniper/src/test/fakerewriter.h @@ -0,0 +1,18 @@ +// Copyright 2016 Yahoo Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. +#pragma once + +#include <vespa/juniper/rewriter.h> +#include <string> + +class FakeRewriter: public juniper::IRewriter +{ +public: + FakeRewriter() : _name() {} + virtual const char* Name() const; + virtual juniper::RewriteHandle* Rewrite(uint32_t langid, const char* term); + virtual juniper::RewriteHandle* Rewrite(uint32_t langid, const char* term, size_t length); + virtual const char* NextTerm(juniper::RewriteHandle* exp, size_t& length); +private: + std::string _name; +}; + diff --git a/juniper/src/test/largeblockchinese.input.utf8 b/juniper/src/test/largeblockchinese.input.utf8 new file mode 100644 index 00000000000..9b85e8d06e6 --- /dev/null +++ b/juniper/src/test/largeblockchinese.input.utf8 @@ -0,0 +1 @@ 
+我åªèƒ½æœŸå¾…期待期待期待期待期待期待期待期待期待期待期待期待期待期待期待期待期待期待期待期待期待期待期待期待期待期待期待希望希望希望希望希望希望希望我åªèƒ½æœŸå¾…期待期待期待期待期待期待期待期待期待期待期待期待期待期待期待期待期待期待期待期待期待期待期待期待期待期待期待希望希望希望希望希望希望希望我åªèƒ½æœŸå¾…期待期待期待期待期待期待期待期待期待期待期待期待期待期待期待期待期待期待期待期待期待期待期待期待期待期待期待希望希望希望希望希望希望希望我åªèƒ½æœŸå¾…期待期待期待期待期待期待期待期待期待期待期待期待期待期待期待期待期待期待期待期待期待期待期待期待期待期待期待希望希望希望希望希望希望希望我åªèƒ½æœŸå¾…期待期待期待期待期待期待期待期待期待期待期待期待期待期待期待期待期待期待期待期待期待期待期待期待期待期待期待希望希望希望希望希望希望希望我åªèƒ½æœŸå¾…期待期待期待期待期待期待期待期待期待期待期待期待期待期待期待期待期待期待期待期待期待期待期待期待期待期待期待希望希望希望希望希望希望希望我åªèƒ½æœŸå¾…期待期待期待期待期待期待期待期待期待期待期待期待期待期待期待期待期待期待期待期待期待期待期待期待期待期待期待希望希望希望希望希望希望希望我åªèƒ½æœŸå¾…期待期待期待期待期待期待期待期待期待期待期待期待期待期待期待期待期待期待期待期待期待期待期待期待期待期待期待希望希望希望希望希望希望希望我åªèƒ½æœŸå¾…期待期待期待期待期待期待期待期待期待期待期待期待期待期待期待期待期待期待期待期待期待期待期待期待期待期待期待希望希望希望希望希望希望希望我åªèƒ½æœŸå¾…期待期待期待期待期待期待期待期待期待期待期待期待期待期待期待期待期待期待期待期待期待期待期待期待期待期待期待希望希望希望希望希望希望希望我åªèƒ½æœŸå¾…期待期待期待期待期待期待期待期待期待期待期待期待期待期待期待期待期待期待期待期待期待期待期待期待期待期待期待希望希望希望希望希望希望希望我åªèƒ½æœŸå¾…期待期待期待期待期待期待期待期待期待期待期待期待期待期待期待期待期待期待期待期待期待期待期待期待期待期待期待希望希望希望希望希望希望希望我åªèƒ½æœŸå¾…期待期待期待期待期待期待期待期待期待期待期待期待期待期待期待期待期待期待期待期待期待期待期待期待期待期待期待希望希望希望希望希望希望希望我åªèƒ½æœŸå¾…期待期待期待期待期待期待期待期待期待期待期待期待期待期待期待期待期待期待期待期待期待期待期待期待期待期待期待希望希望希望希望希望希望希望我åªèƒ½æœŸå¾…期待期待期待期待期待期待期待期待期待期待期待期待期待期待期待期待期待期待期待期待期待期待期待期待期待期待期待希望希望希望希望希望希望希望我åªèƒ½æœŸå¾…期待期待期待期待期待期待期待期待期待期待期待期待期待期待期待期待期待期待期待期待期待期待期待期待期待期待期待希望希望希望希望希望希望希望我åªèƒ½æœŸå¾…期待期待期待期待期待期待期待期待期待期待期待期待期待期待期待期待期待期待期待期待期待期待期待期待期待期待期待希望希望希望希望希望希望希望我åªèƒ½æœŸå¾…期待期待期待期待期待期待期待期待期待期待期待期待期待期待期待期待期待期待期待期待期待期待期待期待期待期待期待希望希望希望希望希望希望希望我åªèƒ½æœŸå¾…期待期待期待期待期待期待期待期待期待期待期待期待期待期待期待期待期待期待期待期待期待期待期待期待期待期待期待希望希望希望希望希望希望希望我åªèƒ½æœŸå¾…期待期待期待期待期待期待期待期待期待期待期待期待期待期待期待期待期待期待期待期待期待期待期待期待期待期待期待希望希望希望希望希望希望希望我åªèƒ½æœŸå¾…期待期待期待期待期待期待期待期待期待期待期待期待期待期待期待期待期待期待期待期待期待期待期待期待期待期待期待希望希望希望希望希望希望希望我åªèƒ½æœŸå¾…期待期待期待期待期待期待期待期待期待期待期待期待期待期待期待期待期待期待期待期待期待期待期待期待期待期待期待希望希望希望希望希望希望希望我åªèƒ½æœŸå¾…期待期待期待期待期待期待期待期待期待期待期待期待期待期待期待期待期待期待期待期待期待期待期待期待期待期待期待希望希望希望希望希望希望希望我åªèƒ½æœŸå¾…期待期待期待期待期待期待期待期待期待期待期待期待期待期待期待期待期待期待期待期待期待期待期待期待期待期待期待希望希望希望希望希望希望希望我åªèƒ½æœŸå¾…期待期待期待期待期待期待期待期待期待期待期待期待期待期待期待期待期待期待期待期待期待期待期待期待期待期待期待希望希望希望希望希望希望希
望我åªèƒ½æœŸå¾…期待期待期待期待期待期待期待期待期待期待期待期待期待期待期待期待期待期待期待期待期待期待期待期待期待期待期待希望希望希望希望希望希望希望我åªèƒ½æœŸå¾…期待期待期待期待期待期待期待期待期待期待期待期待期待期待期待期待期待期待期待期待期待期待期待期待期待期待期待希望希望希望希望希望希望希望我åªèƒ½æœŸå¾…期待期待期待期待期待期待期待期待期待期待期待期待期待期待期待期待期待期待期待期待期待期待期待期待期待期待期待希望希望希望希望希望希望希望我åªèƒ½æœŸå¾…期待期待期待期待期待期待期待期待期待期待期待期待期待期待期待期待期待期待期待期待期待期待期待期待期待期待期待希望希望希望希望希望希望希望我åªèƒ½æœŸå¾…期待期待期待期待期待期待期待期待期待期待期待期待期待期待期待期待期待期待期待期待期待期待期待期待期待期待期待希望希望希望希望希望希望希望我åªèƒ½æœŸå¾…期待期待期待期待期待期待期待期待期待期待期待期待期待期待期待期待期待期待期待期待期待期待期待期待期待期待期待希望希望希望希望希望希望希望我åªèƒ½æœŸå¾…期待期待期待期待期待期待期待期待期待期待期待期待期待期待期待期待期待期待期待期待期待期待期待期待期待期待期待希望希望希望希望希望希望希望我åªèƒ½æœŸå¾…期待期待期待期待期待期待期待期待期待期待期待期待期待期待期待期待期待期待期待期待期待期待期待期待期待期待期待希望希望希望希望希望希望希望我åªèƒ½æœŸå¾…期待期待期待期待期待期待期待期待期待期待期待期待期待期待期待期待期待期待期待期待期待期待期待期待期待期待期待希望希望希望希望希望希望希望我åªèƒ½æœŸå¾…期待期待期待期待期待期待期待期待期待期待期待期待期待期待期待期待期待期待期待期待期待期待期待期待期待期待期待希望希望希望希望希望希望希望我åªèƒ½æœŸå¾…期待期待期待期待期待期待期待期待期待期待期待期待期待期待期待期待期待期待期待期待期待期待期待期待期待期待期待希望希望希望希望希望希望希望我åªèƒ½æœŸå¾…期待期待期待期待期待期待期待期待期待期待期待期待期待期待期待期待期待期待期待期待期待期待期待期待期待期待期待希望希望希望希望希望希望希望我åªèƒ½æœŸå¾…期待期待期待期待期待期待期待期待期待期待期待期待期待期待期待期待期待期待期待期待期待期待期待期待期待期待期待希望希望希望希望希望希望希望我åªèƒ½æœŸå¾…期待期待期待期待期待期待期待期待期待期待期待期待期待期待期待期待期待期待期待期待期待期待期待期待期待期待期待希望希望希望希望希望希望希望我åªèƒ½æœŸå¾…期待期待期待期待期待期待期待期待期待期待期待期待期待期待期待期待期待期待期待期待期待期待期待期待期待期待期待希望希望希望希望希望希望希望我åªèƒ½æœŸå¾…期待期待期待期待期待期待期待期待期待期待期待期待期待期待期待期待期待期待期待期待期待期 diff --git a/juniper/src/test/matchobjectTest.cpp b/juniper/src/test/matchobjectTest.cpp new file mode 100644 index 00000000000..43335ee1d72 --- /dev/null +++ b/juniper/src/test/matchobjectTest.cpp @@ -0,0 +1,421 @@ +// Copyright 2016 Yahoo Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. +/** + * Implementation of the automated unit test class for the MatchObject + * class. 
+ * + * @file matchobjectTest.cpp + * + * @author Knut Omang + * + * @date Created 21 Feb 2003 + * + * $Id$ + * + * <pre> + * Copyright (c) : 2003 Fast Search & Transfer ASA + * ALL RIGHTS RESERVED + * </pre> + ***************************************************************************/ +#include <vespa/fastos/fastos.h> +#include <vespa/log/log.h> +LOG_SETUP(""); +#include "matchobjectTest.h" +#include "fakerewriter.h" + +// Comment out cerr below to ignore unimplemented tests +#define NOTEST(name) \ +std::cerr << std::endl << __FILE__ << ':' << __LINE__ << ": " \ + << "No test for method '" << (name) << "'" << std::endl; + +/************************************************************************* + * Test methods + * + * This section contains boolean methods for testing each public method + * in the class being tested + *************************************************************************/ + +/** + * Test of the Term method. + */ +void MatchObjectTest::testTerm() { + // Test that two equal keywords are matched properly: + TestQuery q("NEAR/2(word,PHRASE(near,word))"); + + const char* content = "This is a small text with word appearing near word"; + size_t content_len = strlen(content); + + // Fetch a result descriptor: + Result* res = juniper::Analyse(juniper::TestConfig, &q._qhandle, + content, content_len, + 0, 0, 0); + _test(res != 0); + + // Do the scanning manually. 
This calls accept several times + res->Scan(); + Matcher& m = *res->_matcher; + + _test(m.TotalHits() == 3);// 3 occurrences + match_candidate_set& ms = m.OrderedMatchSet(); + + _test(ms.size() == 2); + + delete res; + // printf("%d %d\n", m.TotalHits(),ms.size()); + TestQuery q1("t*t"); + TestQuery q2("*ea*"); + TestQuery q3("*d"); + TestQuery q4("*word"); + Result* r1 = juniper::Analyse(juniper::TestConfig, &q1._qhandle, content, content_len, 0, 0, 0); + Result* r2 = juniper::Analyse(juniper::TestConfig, &q2._qhandle, content, content_len, 0, 0, 0); + Result* r3 = juniper::Analyse(juniper::TestConfig, &q3._qhandle, content, content_len, 0, 0, 0); + Result* r4 = juniper::Analyse(juniper::TestConfig, &q4._qhandle, content, content_len, 0, 0, 0); + if (r1 != 0) + { + r1->Scan(); + _test(r1->_matcher->TotalHits() == 1); + delete r1; + } + else + _test(r1 != 0); + + if (r2 != 0) + { + r2->Scan(); + _test(r2->_matcher->TotalHits() == 2); + delete r2; + } + else + _test(r2 != 0); + + if (r3 != 0) + { + r3->Scan(); + _test(r3->_matcher->TotalHits() == 2); + delete r3; + } + else + _test(r3 != 0); + + if (r4 != 0) + { + r4->Scan(); + _test_equal(r4->_matcher->TotalHits(), 2); + delete r4; + } + else + _test(r4 != 0); +} + +/** + * Test of the Match method. 
+ */ +void MatchObjectTest::testMatch() { + // Check that we hit on the longest match first + juniper::QueryParser p("AND(junipe,juniper)"); + juniper::QueryHandle qh(p, NULL, juniper::_Juniper->getModifier()); + + MatchObject* mo = qh.MatchObj(0); + juniper::Result res(juniper::TestConfig, &qh, "", 0, 0); + unsigned opts = 0; + match_iterator mi(mo, &res); + ucs4_t ucs4_str[10]; + Fast_UnicodeUtil::ucs4copy(ucs4_str, "junipers"); + Token token; + token.token = ucs4_str; + token.curlen = 8; + int idx = mo->Match(mi, token, opts); + _test(strcmp(mo->Term(idx)->term(),"juniper") == 0); + + { + // This test would loop in v.2.2.2 + TestQuery q("(word,"); + _test(q._qparser.ParseError()); + } + + { + // Test to trigger ticket #5734 Dev Data Search + std::string + doc("A simple document with an extremelylongwordhit in the middle of it that is" + "long enough to allow the error to be triggered extremelylongwordhit." + "A simple document with an extremelylongwordhit in the middle of it that is" + "long enough to allow the error to be triggered extremelylongwordhit." + "A simple document with an extremelylongwordhit in the middle of it that is" + "long enough to allow the error to be triggered extremelylongwordhit." + "A simple document with an extremelylongwordhit in the middle of it that is" + "long enough to allow the error to be triggered extremelylongwordhit." + "A simple document with an extremelylongwordhit in the middle of it that is" + "long enough to allow the error to be triggered extremelylongwordhit." + "A simple document with an extremelylongwordhit in the middle of it that is" + "long enough to allow the error to be triggered extremelylongwordhit." 
+ "A simple document with an extremelylongwordhit in the middle of it that is" + "long enough to allow the error to be triggered extremelylongwordhit."); + TestQuery q("OR(OR(extremelylongwordhits,extremelylongwordhit,extremelylongwordhits," + "extremelylongwordhit,extremelylongwordhits,extremelylongwordhit," + "extremelylongwordhit,extremelylongwordhits,extremelylongwordhit," + "extremelylongwordhit,extremelylongwordhits,extremelylongwordhit," + "extremelylongwordhit,extremelylongwordhits,extremelylongwordhit," + "extremelylongwordhit,extremelylongwordhits,extremelylongwordhit," + "extremelylongwordhit))"); + QueryHandle& qh1(q._qhandle); + juniper::Result res1(juniper::TestConfig, &qh1, + doc.c_str(), doc.size(), 0); + juniper::Summary* sum = res1.GetTeaser(NULL); + std::string s(sum->Text()); + _test_equal(s, + "A simple document with an <b>extremelylongwordhit</b> in the middle" + " of it that islong enough to allow...triggered " + "<b>extremelylongwordhit</b>.A simple document with an " + "<b>extremelylongwordhit</b> in the middle of it that islong enough to allow..."); + } +} + +/** + * Test matching in annotated buffers + */ +void MatchObjectTest::testMatchAnnotated() { + const char *doc = "A big and ugly teaser about " + "\xEF\xBF\xB9" + "buying" + "\xEF\xBF\xBA" + "buy" + "\xEF\xBF\xBB" + " stuff"; + TestQuery q("AND(big,buy)"); + QueryHandle &qh1(q._qhandle); + juniper::Result res1(juniper::TestConfig, &qh1, + doc, strlen(doc), 0); + juniper::Summary *sum = res1.GetTeaser(NULL); + std::string s(sum->Text()); + + _test_equal(s, + "A <b>big</b> and ugly teaser about <b>" + "\xEF\xBF\xB9" + "buying" + "\xEF\xBF\xBA" + "buy" + "\xEF\xBF\xBB" + "</b> stuff"); +} + + +/** + * Test of the the expansion based (langid) constructor + */ +void MatchObjectTest::testLangid() +{ + FakeRewriter frew; + juniper::_Juniper->AddRewriter("exp", &frew, true, false); + juniper::_Juniper->AddRewriter("red", &frew, false, true); + juniper::_Juniper->AddRewriter("expred", &frew, 
true, true); + + TestQuery q("AND(exp:a,red:b1,expred:c)"); + QueryHandle& qh(q._qhandle); + + { + { + std::string stk; + qh.MatchObj(0)->Query()->Dump(stk); + _test_equal(stk, + "Node<a:3>[Node<a:4>[a0:100,a1:100,a2:100,a3:100]," + "b1:100,Node<a:4>[c0:100,c1:100,c2:100,c3:100]]"); + } + + std::string doc("see if we can match b or c somewhere in this a3 doc. " + "Note that we should not match b1 or c1 or a somewhere.."); + juniper::Result res(juniper::TestConfig, &qh, doc.c_str(), doc.size(),0); + + juniper::Summary* sum = res.GetTeaser(NULL); + std::string s(sum->Text()); + _test_equal(s, + "see if we can match <b>b</b> or <b>c</b> somewhere in this" + " <b>a3</b> doc. Note that we should not match b1 or c1 or a somewhere.."); + } + + { + // Do another test with the same query handle (testing reuse of qh with rewriters) + std::string doc("Try to run this on another doc just to see if b or c still can be" + " matched with the same query handle"); + juniper::Result res(juniper::TestConfig, &qh, + doc.c_str(), doc.size(), 0); + + juniper::Summary* sum = res.GetTeaser(NULL); + std::string s(sum->Text()); + _test_equal(s, + "Try to run this on another doc just to see if <b>b</b> or <b>c</b>" + " still can be matched with the same query handle"); + } + juniper::_Juniper->FlushRewriters(); +} + + +/** + * Test of the the expansion based (langid) constructor in + * combination with a normal search + */ +void MatchObjectTest::testCombined() +{ + FakeRewriter frew; + juniper::_Juniper->AddRewriter("exp", &frew, true, false); + juniper::_Juniper->AddRewriter("red", &frew, false, true); + + TestQuery q("OR(OR(AND(exp:a,b)))"); + QueryHandle& qh(q._qhandle); + + { + std::string doc("see if we can match a3 or c somewhere in this b doc. 
" + "Note that we should not match b1 or c1 or a somewhere.."); + juniper::Result res(juniper::TestConfig, &qh, doc.c_str(), doc.size(), 0); + + juniper::Summary* sum = res.GetTeaser(NULL); + std::string s(sum->Text()); + _test_equal(s, + "see if we can match <b>a3</b> or c somewhere in this <b>b</b> doc." + " Note that we should not match b1 or c1 or a somewhere.."); + } + juniper::_Juniper->FlushRewriters(); +} + +/** Test parameter input via options + */ + +void MatchObjectTest::testParams() +{ + { + TestQuery q("AND(a,b)", "near.1"); + QueryHandle& qh = q._qhandle; + std::string stk; + qh.MatchObj(0)->Query()->Dump(stk); + // Expect l:1 == limit:1 v: Validity check of keywords needed, c: Completeness req'ed + _test_equal(stk, "Node<a:2,l:1,v,c>[a:100,b:100]"); + } + + { + TestQuery q("AND(a,b)", "onear.1"); + QueryHandle& qh = q._qhandle; + std::string stk; + qh.MatchObj(0)->Query()->Dump(stk); + // Expect l:1 == limit:1 o: ordered, v: Validity check of keywords needed, + // c: Completeness req'ed + _test_equal(stk, "Node<a:2,o,l:1,v,c>[a:100,b:100]"); + } + + { + TestQuery q("AND(a,b)", "within.1"); + QueryHandle& qh = q._qhandle; + std::string stk; + qh.MatchObj(0)->Query()->Dump(stk); + // Expect l:1 == limit:1 o: ordered, v: Validity check of keywords needed, + // c: Completeness req'ed + _test_equal(stk, "Node<a:2,o,l:1,v,c>[a:100,b:100]"); + } + + { + // Check that query option replaces orig.query + TestQuery q("OR(a,b)", "query.ONEAR/1(a,b)"); + QueryHandle& qh = q._qhandle; + std::string stk; + qh.MatchObj(0)->Query()->Dump(stk); + // Expect l:1 == limit:1 o: ordered, v: Validity check of keywords needed, + // c: Completeness req'ed + _test_equal(stk, "Node<a:2,o,l:1,v,c>[a:100,b:100]"); + } + + { + // Check that query option replaces orig.query, and check that ANY works.. 
+ TestQuery q("OR(a,b,c)", "query.ANY(a,b)"); + QueryHandle& qh = q._qhandle; + std::string stk; + qh.MatchObj(0)->Query()->Dump(stk); + // Expect a plain two-child node with none of the l (limit), o (ordered), + // v (validity) or c (completeness) flags set + _test_equal(stk, "Node<a:2>[a:100,b:100]"); + } +} + + +/************************************************************************* + * Test administration methods + *************************************************************************/ + +/** + * Set up common stuff for all test methods. + * This method is called immediately before each test method is called + */ +bool MatchObjectTest::setUp() { + return true; +} + +/** + * Tear down common stuff for all test methods. + * This method is called immediately after each test method is called + */ +void MatchObjectTest::tearDown() { +} + +/** + * Build up a map with all test methods + */ +void MatchObjectTest::init() { + test_methods_["testTerm"] = + &MatchObjectTest::testTerm; + test_methods_["testMatch"] = + &MatchObjectTest::testMatch; + test_methods_["testMatchAnnotated"] = + &MatchObjectTest::testMatchAnnotated; + test_methods_["testLangid"] = + &MatchObjectTest::testLangid; + test_methods_["testCombined"] = + &MatchObjectTest::testCombined; + test_methods_["testParams"] = + &MatchObjectTest::testParams; +} + +/************************************************************************* + * main entry points + *************************************************************************/ + + +void MatchObjectTest::Run(MethodContainer::iterator &itr) { + try { + if (setUp()) { + (this->*itr->second)(); + tearDown(); + } + } catch (...) 
{ + _fail("Got unknown exception in test method " + itr->first); + } +} + +void MatchObjectTest::Run(const char* method) { + MethodContainer::iterator pos(test_methods_.find(method)); + if (pos != test_methods_.end()) { + Run(pos); + } else { + std::cerr << "ERROR: No test method named \"" + << method << "\"" << std::endl; + _fail("No such method"); + } +} + +void MatchObjectTest::Run() { + for (MethodContainer::iterator itr(test_methods_.begin()); + itr != test_methods_.end(); + ++itr) + Run(itr); +} + +/* + * Parse runtime arguments before running. + * If the -m METHOD parameter is given, run only that method + */ +void MatchObjectTest::Run(int argc, char* argv[]) { + for (int i = 1; i < argc; ++i) { + if (strcmp(argv[i], "-m") == 0 && argc > i + 1) { + Run(argv[++i]); + return; + } + } + Run(); +} diff --git a/juniper/src/test/matchobjectTest.h b/juniper/src/test/matchobjectTest.h new file mode 100644 index 00000000000..fb2c4bc0578 --- /dev/null +++ b/juniper/src/test/matchobjectTest.h @@ -0,0 +1,120 @@ +// Copyright 2016 Yahoo Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. +/** + * Definition of the automated unit test class for the MatchObject class. + * + * @file matchobjectTest.h + * + * @author Knut Omang + * + * @date Created 21 Feb 2003 + * + * $Id$ + * + * <pre> + * Copyright (c) : 2003 Fast Search & Transfer ASA + * ALL RIGHTS RESERVED + * </pre> + ***************************************************************************/ +#pragma once + +#include <map> +#include <vespa/fastlib/testsuite/test.h> +#include "testenv.h" + +/** + * The MatchObjectTest class holds + * the unit tests for the MatchObject class. 
+ * + * @sa MatchObject + * @author Knut Omang + */ +class MatchObjectTest : public Test { + + /************************************************************************* + * Test methods + * + * This section contains boolean methods for testing each public method + * in the class ing tested + *************************************************************************/ + + /** + * Test of the Term method. + */ + void testTerm(); + + + /** + * Test of performance + */ + void testPerformance(); + + /** + * Test of the Match method. + */ + void testMatch(); + + /** + * Test of the Match method on annotated buffers. + */ + void testMatchAnnotated(); + + /** + * Test of the the expansion based (langid) constructor + */ + void testLangid(); + void testCombined(); + + + /** Test parameter input via query handle options + */ + void testParams(); + + + /************************************************************************* + * Test administration methods + *************************************************************************/ + + /** + * Set up common stuff for all test methods. + * This method is called immediately before each test method is called + */ + bool setUp(); + + /** + * Tear down common stuff for all test methods. + * This method is called immediately after each test method is called + */ + void tearDown(); + + typedef void(MatchObjectTest::* tst_method_ptr) (); + typedef std::map<std::string, tst_method_ptr> MethodContainer; + MethodContainer test_methods_; + void init(); + +protected: + + /** + * Since we are running within Emacs, the default behavior of + * print_progress which includes backspace does not work. + * We'll use a single '.' instead. + */ + virtual void print_progress() { *m_osptr << '.' 
<< std::flush; } + +public: + + MatchObjectTest() : Test("MatchObject"), test_methods_() { init(); } + ~MatchObjectTest() {} + + /************************************************************************* + * main entry points + *************************************************************************/ + void Run(MethodContainer::iterator &itr); + virtual void Run(); + void Run(const char *method); + void Run(int argc, char* argv[]); +}; + + +// Local Variables: +// mode:c++ +// End: diff --git a/juniper/src/test/matchobjectTestApp.cpp b/juniper/src/test/matchobjectTestApp.cpp new file mode 100644 index 00000000000..ada38fa1747 --- /dev/null +++ b/juniper/src/test/matchobjectTestApp.cpp @@ -0,0 +1,42 @@ +// Copyright 2016 Yahoo Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. +/** + * Definition and implementation of the application for running unit tests + * for the MatchObject class in isolation. + * + * @file matchobjectTestApp.cpp + * + * @author Knut Omang + * + * @date Created 21 Feb 2003 + * + * $Id$ + * + * <pre> + * Copyright (c) : 2003 Fast Search & Transfer ASA + * ALL RIGHTS RESERVED + * </pre> + ****************************************************************************/ +#include <vespa/fastos/fastos.h> +#include <vespa/log/log.h> +LOG_SETUP("matchobjectTest"); +#include "matchobjectTest.h" +#include "testenv.h" + +/** + * The MatchObjectTestApp class is the main routine for running the unit + * tests for the MatchObject class in isolation. 
+ * + * @sa MatchObject @author Knut Omang + */ +class MatchObjectTestApp : public FastOS_Application { +public: + virtual int Main() { + juniper::TestEnv te(this, "../rpclient/testclient.rc"); + MatchObjectTest test; + test.SetStream(&std::cout); + test.Run(_argc, _argv); + return (int)test.Report(); + } +}; + +FASTOS_MAIN(MatchObjectTestApp); diff --git a/juniper/src/test/mcandTest.cpp b/juniper/src/test/mcandTest.cpp new file mode 100644 index 00000000000..3da6b452ace --- /dev/null +++ b/juniper/src/test/mcandTest.cpp @@ -0,0 +1,659 @@ +// Copyright 2016 Yahoo Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. +/** + * Implementation of the automated unit test class for the MatchCandidate + * class. + * + * @file mcandTest.cpp + * + * @author Knut Omang + * + * @date Created 27 Feb 2003 + * + * $Id$ + * + * <pre> + * Copyright (c) : 2003 Fast Search & Transfer ASA + * ALL RIGHTS RESERVED + * </pre> + ***************************************************************************/ +#include <vespa/fastos/fastos.h> +#include <vespa/log/log.h> +LOG_SETUP(""); +#include "mcandTest.h" +#include "testenv.h" + +// Comment out cerr below to ignore unimplemented tests +#define NOTEST(name) \ + std::cerr << std::endl << __FILE__ << ':' << __LINE__ << ": " \ + << "No test for method '" << (name) << "'" << std::endl; + + +MatchCandidateTest::MatchCandidateTest() : + Test("MatchCandidate"), test_methods_() +{ init(); } + +/************************************************************************* + * Test methods + * + * This section contains boolean methods for testing each public method + * in the class being tested + *************************************************************************/ + +/** + * Test of the SetDocid method. + */ +void MatchCandidateTest::testSetDocid() { +// NOTEST("SetDocid"); +} + + +/** + * Test that the empty query is handled properly even for Analyse and + * GetTeaser/GetRelevancy/GetLog calls.. 
(Fastserver < 4.21 semantics) + */ +void MatchCandidateTest::testLog() { + TestQuery q(""); + std::string content("Here we go hepp and then some words away hoi some silly text here"); + + Result* res = juniper::Analyse(juniper::TestConfig, + &q._qhandle, + content.c_str(), content.size(), + 0, 0, 0); + _test(res); // We get a result handle + _test(!res->_mo); // but it is empty + + juniper::Summary* sum = juniper::GetTeaser(res); + std::string s(sum->Text()); + _test_equal(s, std::string("")); + + long relevance = juniper::GetRelevancy(res); + _test_equal(relevance, PROXIMITYBOOST_NOCONSTRAINT_OFFSET); + + sum = juniper::GetLog(res); + s = sum->Text(); + _test_equal(s, std::string("")); + juniper::ReleaseResult(res); +} + + +/** + * Test of proximity metric = 0 + */ +void MatchCandidateTest::testDump() { + std::string content("Here we go hepp and then some words away hoi"); + + { + TestQuery q("NEAR/1(hepp,hoi)"); + Result* res = juniper::Analyse(juniper::TestConfig, + &q._qhandle, + content.c_str(), content.size(), + 0, 0, 0); + _test(res != NULL); + long relevance = juniper::GetRelevancy(res); + // zero value since there are no hits and constraints are enabled.. + _test_equal(relevance, 0); + juniper::ReleaseResult(res); + } + + { + TestQuery q("OR(NEAR/1(hepp,hoi),bananas)"); + Result* res = juniper::Analyse(juniper::TestConfig, + &q._qhandle, + content.c_str(), content.size(), + 0, 0, 0); + _test(res != NULL); + long relevance = juniper::GetRelevancy(res); + // Check that X_CONSTR propagates as intended + _test_equal(relevance, 0); + juniper::ReleaseResult(res); + } + + { + TestQuery q("PHRASE(hepp,hoi)"); + Result* res = juniper::Analyse(juniper::TestConfig, + &q._qhandle, + content.c_str(), content.size(), + 0, 0, 0); + _test(res != NULL); + long relevance = juniper::GetRelevancy(res); + // constant value since there are no hits but this is + // also not a constrained search.. 
+ _test_equal(relevance, PROXIMITYBOOST_NOCONSTRAINT_OFFSET); + juniper::ReleaseResult(res); + } + + { + TestQuery q("AND(hepp,hoi)"); + Result* res = juniper::Analyse(juniper::TestConfig, + &q._qhandle, + content.c_str(), content.size(), + 0, 0, 0); + _test(res != NULL); + long relevance = juniper::GetRelevancy(res); + // Relevance may change, but nice to discover such changes.. + // The important is that we get a nonzero value here as a hit + _test_equal(relevance, 4470); + juniper::ReleaseResult(res); + } +} + + +/** + * Test of the order method. + */ +void MatchCandidateTest::testorder() { + TestQuery q("PHRASE(test,phrase)"); + + const char* content = "This is a simple text where a phrase match can be found not" + " quite adjacent to a test phrase work"; + size_t content_len = strlen(content); + + // Fetch a result descriptor: + Result* res = juniper::Analyse(juniper::TestConfig, + &q._qhandle, + content, content_len, + 0, 0, 0); + _test(res != 0); + + // Do the scanning manually. Scan calls accept several times + res->Scan(); + Matcher& m = *res->_matcher; + + _test(m.TotalHits() == 3); // 3 occurrences + + match_candidate_set& ms = m.OrderedMatchSet(); + + _test(ms.size() == 1); + juniper::ReleaseResult(res); +} + + +/** + * Test of the matches_limit method. + */ +void MatchCandidateTest::testMatches_limit() { + TestQuery q("OR(PHRASE(phrase,match),PHRASE(test,word))"); + + const char* content = "This is a simple text where a phrase match can be found not" + " quite adjacent to a test word"; + size_t content_len = strlen(content); + + // Fetch a result descriptor: + Result* res = juniper::Analyse(juniper::TestConfig, + &q._qhandle, + content, content_len, + 0, 0, 0); + _test(res != 0); + + // Do the scanning manually. 
This calls accept several times + res->Scan(); + Matcher& m = *res->_matcher; + + _test(m.TotalHits() == 4);// 3 occurrences + + match_candidate_set& ms = m.OrderedMatchSet(); + + _test(ms.size() == 2); // The first (complete) match and the second starting at "test" + + // Check if we get the correct teaser as well.. + juniper::Summary* sum = juniper::GetTeaser(res); + _test(strcmp(sum->Text(), + "This is a simple text where a <b>phrase</b> <b>match</b> can be found not" + " quite adjacent to a <b>test</b> <b>word</b>") == 0); + juniper::ReleaseResult(res); +} + + +/** + * Test of the accept method. + */ +void MatchCandidateTest::testAccept() { + TestQuery q("AND(simple,test)"); + + const char* content = "This is a simple test where we should get a perfect match"; + size_t content_len = strlen(content); + + // Fetch a result descriptor: + Result* res = juniper::Analyse(juniper::TestConfig, + &q._qhandle, + content, content_len, + 0, 0, 0); + _test(res != 0); + + // Do the scanning manually. This calls accept several times + res->Scan(); + Matcher& m = *res->_matcher; + + _test(m.TotalHits() == 2); // 2 overlapping candidate starting points + _test(m.QueryTerms() == 2); // 2 query terms + + match_candidate_set& ms = m.OrderedMatchSet(); + + _test(ms.size() > 0); + + if (!ms.size()) { + juniper::ReleaseResult(res); + return; // No point in continuing.. + } + + MatchCandidate& mc = *(*(ms.begin())); + + _test(mc.elems() == 2); + _test(mc.startpos() == 10); + _test(mc.endpos() == 21); + _test(!mc.order()); // Unordered for AND op + _test(mc.ctxt_startpos() == 0); + + mc.make_keylist(); + _test(mc._klist.size() == 2); // Two occurrence elements in list + + // Just for the sake of it, verify that we get a proper teaser out of this also.. + juniper::Summary* sum = juniper::GetTeaser(res); + _test(strcmp(sum->Text(), + "This is a <b>simple</b> <b>test</b> where we should get a perfect match") == 0); + juniper::ReleaseResult(res); +} + + +/** + * Test of the rank method. 
+ */ +void MatchCandidateTest::testRank() { +// NOTEST("rank"); +} + + +/** + * Test of simple nested query + */ +void MatchCandidateTest::testMake_keylist() { + TestQuery q("OR(AND(phrase,match),AND(test,phrase))"); + + const char* content = "This is a simple text where a phrase match can be found not" + " quite adjacent to a test phrase"; + size_t content_len = strlen(content); + + // Fetch a result descriptor: + Result* res = juniper::Analyse(juniper::TestConfig, + &q._qhandle, + content, content_len, + 0, 0, 0); + _test(res != 0); + + // Do the scanning manually. This calls accept several times + res->Scan(); + Matcher& m = *res->_matcher; + + _test(m.TotalHits() == 4);// 3 occurrences + + match_candidate_set& ms = m.OrderedMatchSet(); + + _test_equal(static_cast<size_t>(ms.size()), 6u); + + juniper::ReleaseResult(res); +} + + +/** + * Test of the add_to_keylist method. + */ +void MatchCandidateTest::testAdd_to_keylist() { + // Nested NEAR-test (triggered if nested NEAR with PHRASE) Ticket Dev Data Search 6109 + TestQuery q("NEAR/4(PHRASE(phr1,phr2),PHRASE(phr3,phr4))"); + + const char* content = "connect truende. phr1 phr2 www www www phr3 phr4 acuicola 8844"; + size_t content_len = strlen(content); + + // Fetch a result descriptor: + Result* res = juniper::Analyse(juniper::TestConfig, + &q._qhandle, + content, content_len, + 0, 0, 0); + _test(res != 0); + +// Do the scanning manually. This calls accept several times + res->Scan(); + Matcher& m = *res->_matcher; + + _test(m.TotalHits() == 4);// 4 occurrences + + match_candidate_set& ms = m.OrderedMatchSet(); + + _test_equal(static_cast<size_t>(ms.size()), 1u); // Single result + + // Bug triggered when result is fetched.. + juniper::Summary* sum = juniper::GetTeaser(res); + std::string s(sum->Text()); + _test_equal(s, + "connect truende. <b>phr1</b> <b>phr2</b> www www www <b>phr3</b>" + " <b>phr4</b> acuicola 8844"); + + juniper::ReleaseResult(res); +} + + +/** + * Test of the length method. 
+ */ +void MatchCandidateTest::testLength() { + const char* content = "this simple text with adjacent words of a certain pattern must" + " be matched according to specific rules to be detailed in this test."; + size_t content_len = strlen(content); + + { + // Nested complex NEAR-test with double matches at same pos + TestQuery q("NEAR/4(pattern,NEAR/1(simple,with),NEAR/2(simple,adjacent))"); + + // Fetch a result descriptor: + Result* res = juniper::Analyse(juniper::TestConfig, &q._qhandle, + content, content_len, + 0, 0, 0); + + juniper::Summary* sum = juniper::GetTeaser(res); + Matcher& m = *res->_matcher; + match_candidate_set& ms = m.OrderedMatchSet(); + _test_equal(static_cast<size_t>(ms.size()), 1u); + + std::string s(sum->Text()); + _test_equal(s, + "this <b>simple</b> text <b>with</b> <b>adjacent</b> words of " + "a certain <b>pattern</b> must be matched according to specific" + " rules to be detailed in this test."); + juniper::ReleaseResult(res); + } + + { + // Nested complex NEAR-test with double matches at same pos should not yield hit with ONEAR + TestQuery q("ONEAR/4(pattern,NEAR/1(simple,with),NEAR/2(simple,adjacent))"); + + // Fetch a result descriptor: + Result* res = juniper::Analyse(juniper::TestConfig, + &q._qhandle + ,content, content_len, + 0, 0, 0); + + res->Scan(); + Matcher& m = *res->_matcher; + match_candidate_set& ms = m.OrderedMatchSet(); + _test_equal(static_cast<size_t>(ms.size()), 0u); + + juniper::ReleaseResult(res); + } + + { + // Likewise nested complex NEAR-test with double matches at same pos but just outside limit + // should not match: + TestQuery q("NEAR/4(pattern,NEAR/1(simple,with),NEAR/1(simple,adjacent))"); + + // Fetch a result descriptor: + Result* res = juniper::Analyse(juniper::TestConfig, &q._qhandle, + content, content_len, + 0, 0, 0); + + res->Scan(); + Matcher& m = *res->_matcher; + match_candidate_set& ms = m.OrderedMatchSet(); + _test_equal(static_cast<size_t>(ms.size()), 0u); + + juniper::ReleaseResult(res); + 
} +} + + +struct MyTokenProcessor : public ITokenProcessor +{ + Matcher &_m; + std::vector<size_t> _cands; + MyTokenProcessor(Matcher &m) : _m(m), _cands() {} + virtual void handle_token(Token &token) { + _m.handle_token(token); + const match_sequence *ms = _m.GetWorkSet(); + _cands.push_back(ms[0].size()); + LOG(info, "match_sequence[0].size(%zu)", _cands.back()); + } + virtual void handle_end(Token &token) { + _m.handle_end(token); + } +}; + + +/** + * Test that max number of match candidates can be controlled. + */ +void MatchCandidateTest::requireThatMaxNumberOfMatchCandidatesCanBeControlled() +{ + TestQuery q("PHRASE(re,re,re,re,foo,re,re,re,re,bar)"); + q._qhandle._max_match_candidates = 4; + + const char *content = "re re re re foo re re re re bar re re re re foo re re re re bar"; + size_t content_len = strlen(content); + + Result *res = juniper::Analyse(juniper::TestConfig, + &q._qhandle, + content, content_len, + 0, 0, 0); + _test(res != 0); + + // Deflect tokens to my processor + Matcher &m = *res->_matcher; + MyTokenProcessor proc(m); + res->_tokenizer->SetSuccessor(&proc); + res->Scan(); + + _test_equal(proc._cands.size(), 20u); + for (size_t i = 0; i < proc._cands.size(); ++i) { + _test(proc._cands[i] <= 4u); + } + _test_equal(m.TotalHits(), 20); + match_candidate_set& mcs = m.OrderedMatchSet(); + _test_equal(static_cast<size_t>(mcs.size()), 2u); + + juniper::ReleaseResult(res); +} + + +/** + * Test of the order method. + */ +void MatchCandidateTest::testOrder() { +// NOTEST("order"); +} + + +/** + * Test of the size method. + */ +void MatchCandidateTest::testSize() { +// NOTEST("size"); +} + + +/** + * Test of the endpos method. + */ +void MatchCandidateTest::testEndpos() { +// NOTEST("endpos"); +} + + +/** + * Test of the ctxt_startpos method. + */ +void MatchCandidateTest::testCtxt_startpos() { +// NOTEST("ctxt_startpos"); +} + + +/** + * Test of the starttoken method. 
+ */ +void MatchCandidateTest::testStarttoken() { +// NOTEST("starttoken"); +} + + +/** + * Test of the word_distance method. + */ +void MatchCandidateTest::testWord_distance() { +// NOTEST("word_distance"); +} + + +/** + * Test of the distance method. + */ +void MatchCandidateTest::testDistance() { +// NOTEST("distance"); +} + + +/** + * Test of the elem_store_sz method. + */ +void MatchCandidateTest::testElem_store_sz() { +// NOTEST("elem_store_sz"); +} + + +/** + * Test of the elems method. + */ +void MatchCandidateTest::testElems() { +// NOTEST("elems"); +} + + +/** + * Test of the distance method. + */ +void MatchCandidateTest::testDistance1() { +// NOTEST("distance"); +} + + +/** + * Test of the set_valid method. + */ +void MatchCandidateTest::testSet_valid() { +// NOTEST("set_valid"); +} + + +/************************************************************************* + * Test administration methods + *************************************************************************/ + +/** + * Set up common stuff for all test methods. + * This method is called immediately before each test method is called + */ +bool MatchCandidateTest::setUp() { + return true; +} + +/** + * Tear down common stuff for all test methods. 
+ * This method is called immediately after each test method is called + */ +void MatchCandidateTest::tearDown() { +} + +/** + * Build up a map with all test methods + */ +void MatchCandidateTest::init() { + test_methods_["testSetDocid"] = + &MatchCandidateTest::testSetDocid; + test_methods_["testLog"] = + &MatchCandidateTest::testLog; + test_methods_["testDump"] = + &MatchCandidateTest::testDump; + test_methods_["testorder"] = + &MatchCandidateTest::testorder; + test_methods_["testMatches_limit"] = + &MatchCandidateTest::testMatches_limit; + test_methods_["testAccept"] = + &MatchCandidateTest::testAccept; + test_methods_["testRank"] = + &MatchCandidateTest::testRank; + test_methods_["testMake_keylist"] = + &MatchCandidateTest::testMake_keylist; + test_methods_["testAdd_to_keylist"] = + &MatchCandidateTest::testAdd_to_keylist; + test_methods_["testLength"] = + &MatchCandidateTest::testLength; + test_methods_["requireThatMaxNumberOfMatchCandidatesCanBeControlled"] = + &MatchCandidateTest::requireThatMaxNumberOfMatchCandidatesCanBeControlled; + test_methods_["testOrder"] = + &MatchCandidateTest::testOrder; + test_methods_["testSize"] = + &MatchCandidateTest::testSize; + test_methods_["testEndpos"] = + &MatchCandidateTest::testEndpos; + test_methods_["testCtxt_startpos"] = + &MatchCandidateTest::testCtxt_startpos; + test_methods_["testStarttoken"] = + &MatchCandidateTest::testStarttoken; + test_methods_["testWord_distance"] = + &MatchCandidateTest::testWord_distance; + test_methods_["testDistance"] = + &MatchCandidateTest::testDistance; + test_methods_["testElem_store_sz"] = + &MatchCandidateTest::testElem_store_sz; + test_methods_["testElems"] = + &MatchCandidateTest::testElems; + test_methods_["testDistance1"] = + &MatchCandidateTest::testDistance1; + test_methods_["testSet_valid"] = + &MatchCandidateTest::testSet_valid; +} + +/************************************************************************* + * main entry points + 
*************************************************************************/ + + +void MatchCandidateTest::Run(MethodContainer::iterator &itr) { + try { + if (setUp()) { + (this->*itr->second)(); + tearDown(); + } + } catch (...) { + _fail("Got unknown exception in test method " + itr->first); + } +} + +void MatchCandidateTest::Run(const char* method) { + MethodContainer::iterator pos(test_methods_.find(method)); + if (pos != test_methods_.end()) { + Run(pos); + } else { + std::cerr << "ERROR: No test method named \"" + << method << "\"" << std::endl; + _fail("No such method"); + } +} + +void MatchCandidateTest::Run() { + for (MethodContainer::iterator itr(test_methods_.begin()); + itr != test_methods_.end(); + ++itr) + Run(itr); +} + +/* + * Parse runtime arguments before running. + * If the -m METHOD parameter is given, run only that method + */ +void MatchCandidateTest::Run(int argc, char* argv[]) { + for (int i = 1; i < argc; ++i) { + if (strcmp(argv[i], "-m") == 0 && argc > i + 1) + { + Run(argv[++i]); + return; + } + } + Run(); +} diff --git a/juniper/src/test/mcandTest.h b/juniper/src/test/mcandTest.h new file mode 100644 index 00000000000..52a8c626a15 --- /dev/null +++ b/juniper/src/test/mcandTest.h @@ -0,0 +1,218 @@ +// Copyright 2016 Yahoo Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. +/** + * Definition of the automated unit test class for the MatchCandidate + * class. + * + * @file mcandTest.h + * + * @author Knut Omang + * + * @date Created 27 Feb 2003 + * + * $Id$ + * + * <pre> + * Copyright (c) : 2003 Fast Search & Transfer ASA + * ALL RIGHTS RESERVED + * </pre> + ***************************************************************************/ +#pragma once + +#include <map> +#include <vespa/fastlib/testsuite/test.h> +#include "testenv.h" +#include <vespa/juniper/mcand.h> + +/** + * The MatchCandidateTest class holds + * the unit tests for the MatchCandidate class. 
+ * + * @sa MatchCandidate + * @author Knut Omang + */ +class MatchCandidateTest : public Test { + + /************************************************************************* + * Test methods + * + * This section contains boolean methods for testing each public method + * in the class ing tested + *************************************************************************/ + + /** + * Test of the SetDocid method. + */ + void testSetDocid(); + + + /** + * Test of the log method. + */ + void testLog(); + + + /** + * Test of the dump method. + */ + void testDump(); + + + /** + * Test of the (order method. + */ + void testorder(); + + + /** + * Test of the matches_limit method. + */ + void testMatches_limit(); + + + /** + * Test of the accept method. + */ + void testAccept(); + + + /** + * Test of the rank method. + */ + void testRank(); + + + /** + * Test of the make_keylist method. + */ + void testMake_keylist(); + + + /** + * Test of the add_to_keylist method. + */ + void testAdd_to_keylist(); + + + /** + * Test of the length method. + */ + void testLength(); + + /** + * Test that the max number of match candidates can be controlled. + */ + void requireThatMaxNumberOfMatchCandidatesCanBeControlled(); + + /** + * Test of the order method. + */ + void testOrder(); + + + /** + * Test of the size method. + */ + void testSize(); + + + /** + * Test of the endpos method. + */ + void testEndpos(); + + + /** + * Test of the ctxt_startpos method. + */ + void testCtxt_startpos(); + + + /** + * Test of the starttoken method. + */ + void testStarttoken(); + + + /** + * Test of the word_distance method. + */ + void testWord_distance(); + + + /** + * Test of the distance method. + */ + void testDistance(); + + + /** + * Test of the elem_store_sz method. + */ + void testElem_store_sz(); + + + /** + * Test of the elems method. + */ + void testElems(); + + + /** + * Test of the distance method. + */ + void testDistance1(); + + + /** + * Test of the set_valid method. 
+ */ + void testSet_valid(); + + + /************************************************************************* + * Test administration methods + *************************************************************************/ + + /** + * Set up common stuff for all test methods. + * This method is called immediately before each test method is called + */ + bool setUp(); + + /** + * Tear down common stuff for all test methods. + * This method is called immediately after each test method is called + */ + void tearDown(); + + typedef void(MatchCandidateTest::* tst_method_ptr) (); + typedef std::map<std::string, tst_method_ptr> MethodContainer; + MethodContainer test_methods_; + void init(); +protected: + + /** + * Since we are running within Emacs, the default behavior of + * print_progress which includes backspace does not work. + * We'll use a single '.' instead. + */ + virtual void print_progress() { *m_osptr << '.' << std::flush; } + +public: + + MatchCandidateTest(); + ~MatchCandidateTest() {} + + /************************************************************************* + * main entry points + *************************************************************************/ + void Run(MethodContainer::iterator &itr); + virtual void Run(); + void Run(const char *method); + void Run(int argc, char* argv[]); +}; + + +// Local Variables: +// mode:c++ +// End: diff --git a/juniper/src/test/mcandTestApp.cpp b/juniper/src/test/mcandTestApp.cpp new file mode 100644 index 00000000000..7bbc9681d21 --- /dev/null +++ b/juniper/src/test/mcandTestApp.cpp @@ -0,0 +1,42 @@ +// Copyright 2016 Yahoo Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. +/** + * Definition and implementation of the application for running unit tests + * for the MatchCandidate class in isolation. 
+ * + * @file mcandTestApp.cpp + * + * @author Knut Omang + * + * @date Created 27 Feb 2003 + * + * $Id$ + * + * <pre> + * Copyright (c) : 2003 Fast Search & Transfer ASA + * ALL RIGHTS RESERVED + * </pre> + ****************************************************************************/ +#include <vespa/fastos/fastos.h> +#include <vespa/log/log.h> +LOG_SETUP("mcandTest"); +#include "mcandTest.h" +#include "testenv.h" + +/** + * The MatchCandidateTestApp class is the main routine for running the unit + * tests for the MatchCandidate class in isolation. + * + * @sa MatchCandidate @author Knut Omang + */ +class MatchCandidateTestApp : public FastOS_Application { +public: + virtual int Main() { + juniper::TestEnv te(this, "../rpclient/testclient.rc"); + MatchCandidateTest test; + test.SetStream(&std::cout); + test.Run(_argc, _argv); + return (int)test.Report(); + } +}; + +FASTOS_MAIN(MatchCandidateTestApp); diff --git a/juniper/src/test/partialutf8.input.utf8 b/juniper/src/test/partialutf8.input.utf8 new file mode 100644 index 00000000000..df25fbb8c60 --- /dev/null +++ b/juniper/src/test/partialutf8.input.utf8 @@ -0,0 +1 @@ +補充拉∼看樣åä½ çŸ¥é“有哪些é…件阿 襪å那些 總共有6個é¡è‰²é‚„有ä¿è·å¥—http://www.fnac.com.tw/searchresults.aspx?kind=1&txtsearch=ipod&productid=23550703這是法雅客的網站 å¯ä»¥åƒè€ƒ===================================================這是ï½ï½ï½ï½Œï½…網站上完整的é…件淡 很多å°ç£è²·ä¸åˆ°å–”belkin 錄音è£ç½®ç”¨ belkin voice recorder 錄音è£ç½®ï¼Œå°‡æ‚¨çš„語音ç†è¨˜éŒ„進 ipod;您å¯ä»¥ç”¨å®ƒéŒ„製備忘錄ã€æœƒè°è¨˜éŒ„ã€è¨ªè«‡éŽç¨‹ç‰ç‰ï¼Œä¸¦å°‡å…§å®¹å„²å˜åœ¨ ipod ä¸Šï¼Œé‚„èƒ½åŠ è¨»æ™‚é–“å’Œæ—¥æœŸï¼Œä»¥ä¾¿æ—¥å¾ŒæŸ¥æ‰¾ã€‚ï¼ˆè«‹æ³¨æ„:本產å“與 ipod mini 並ä¸ç›¸å®¹ï¼‰ ipod 線控è£ç½®èˆ‡è€³æ©Ÿæœ‰äº†ç·šæŽ§è£ç½®ã€å†åŠ 上一組é¡å¤–çš„è€³æ©Ÿï¼Œåœ¨è·¯ä¸Šè½ ipod 就更方便了。本產å“å¯ä»¥æé…具有 dock é€£æŽ¥åŸ çš„ ipodã€ä»¥åŠ ipod mini 使用。 belkin 讀å¡æ©Ÿä¸‹æ¬¡æ¸¡å‡çš„時候,您å¯ä»¥ä¸å¿…å†å¹«æ•¸ä½ç›¸æ©Ÿå¸¶ä¸€å¤§å †è¨˜æ†¶å¡äº†ã€‚當您的記憶å¡æ‹æ»¿ç…§ç‰‡çš„時候,åªè¦ä½¿ç”¨æ–¹ä¾¿çš„ belkin media reader 讀å¡æ©Ÿï¼Œå°±å¯ä»¥å°‡æ•¸ä½ç…§ç‰‡ç›´æŽ¥å˜é€² ipod。(請注æ„:本產å“與 ipod mini 
並ä¸ç›¸å®¹ï¼‰ belkin é›»æ± çµ„å¿…é ˆé›¢é–‹é›»è…¦å¥½å¹¾å¤©å—Žï¼Ÿæ‚¨å¯ä»¥é¸è³¼ belkin å…¬å¸å‡ºå“çš„å‚™ç”¨é›»æ± çµ„ï¼Œè®“æ‚¨å¯ä»¥ä½¿ç”¨ 4 個標準的 3 è™Ÿé›»æ± è®“ ipod 連續æ’é€éŸ³æ¨‚ 20 å°æ™‚。 ipod 攜帶套這個特製的攜帶套å¯ä»¥ä¿è·æ‚¨çš„ ipod ä¸å—外物碰撞,而且隨身攜帶更方便。您å¯ä»¥å°‡å®ƒå¤¾åœ¨è…°å¸¶ä¸Šã€æ‰‹æ袋上ã€ç”šè‡³èƒŒåŒ…上。多國電æºè½‰æŽ¥å¥—件有了這個é…件,您就å¯ä»¥å®‰å¿ƒæŠŠ ipod 帶到全世界任何地方。它包å«äº† 6 種ä¸åŒå½¢ç‹€çš„é›»æºæ’é ,é©ç”¨æ–¼å…¨ä¸–ç•Œå„地的電æºæ’座。 最é©åˆåœ¨å®¶ä½¿ç”¨çš„ ipod é…件 ipod dock 或 ipod mini dock為您家或辦公室å†å¤šè²·ä¸€å€‹ ipod dock 轉接座。這個轉接座上有立體è²ç·šè·¯è¼¸å‡ºï¼Œèƒ½é€£æŽ¥æ‚¨çš„ç«‹é«”è²æšè²å™¨ï¼Œè®“ ipod 變æˆä¸€çµ„節çœç©ºé–“的立體音響。 dock é€£æŽ¥åŸ è‡³ usb 2.0 + firewire 訊號線(pc é©ç”¨ï¼‰windows 專用的這款 dock 至 usb 2.0 + firewire 訊號線讓 ipod 能é€éŽ usb 2.0* 進行資料åŒæ¥ï¼Œä¸¦ä¸”以 firewire 介é¢å……電。(請注æ„:本產å“與 ipod mini 並ä¸ç›¸å®¹ï¼‰ jbl creature ii æšè²å™¨å°‡æ‚¨çš„ ipod 變æˆä¸€éƒ¨å®¶ç”¨éŸ³éŸ¿ã€‚é€éŽç°¡å–®çš„å³æ’å³ç”¨è¨å®šã€ä»¥åŠä¾¿åˆ©çš„觸控開關,creature 能將您的多媒體音效帶往å¦ä¸€å€‹ç©ºé–“。新版 ipod 專用 navipod 紅外線é™æŽ§å™¨é€™æ¬¾é™æŽ§å™¨æœ‰ 5 個按鈕,å¦å¤–有一個接收器直接æ’在 ipod é ‚ç«¯ï¼›é€éŽé™æŽ§å™¨ï¼Œæ‚¨å¯ä»¥åœ¨æˆ¿é–“çš„å¦å¤–一端æ“作 ipod。ipod 立體音響連接套件想讓您的音樂資料庫和æ’放列表æˆç‚ºä¸‹ä¸€æ¬¡æ´¾å°çš„焦點嗎?蘋果的 apple stereo connection kit 立體音響連接套件,å†åŠ 上 monster cable 訊號線,就是最好的ç”案。(本產å“å¿…é ˆæé…具有 dock é€£æŽ¥åŸ çš„ ipod 機種使用)monster isplitter這款由 monster cable 出å“çš„ ipod è¿·ä½ ç«‹é«”è² y 形分岔接é ,能將 2 çµ„è¿·ä½ ç«‹é«”è²è€³æ©Ÿæˆ–æšè²å™¨é€£æŽ¥åœ¨ä¸€å€‹æŽ¥å£ä¸Šï¼Œè®“您å¯ä»¥å’Œæœ‹å‹åˆ†äº«éŸ³æ¨‚。 給駕車奔馳一æ—çš„ ipod 玩家 griffin itrip fm æ’放機ç¾åœ¨ï¼Œæ‚¨çš„ ipod å¯ä»¥é€éŽæ‚¨è»Šä¸Šçš„ fm 收音機æ’放音樂了。 itrip fm transmitter æ’放機å¯ä»¥æé…具有 dock é€£æŽ¥åŸ çš„æ–°æ¬¾ ipod 使用,而且åªæœƒè€—費極為少é‡çš„ ipod 電力,也ä¸éœ€è¦å®‰è£é›»æ± 。 sony 錄音帶轉接器將 ipod 連接到您的汽車音響上;åªè¦å°‡è½‰æŽ¥å™¨æ’進汽車音響上的錄音座,您就å¯ä»¥ç«‹å³äº«å— ipod ä¸Šçš„éŸ³æ¨‚ã€‚ï¼ˆå¿…é ˆæé…能橫å‘æ’入錄音帶的汽車音響?/p> belkin 車用點煙器轉接è£ç½®å¯é¸è³¼çš„ belkin 車用點煙器轉接è£ç½®å…§å«ä¸€çµ„ 3.5mm 訊號輸出線,å¯ä»¥ç›´æŽ¥æ’上汽車音響的訊號輸入æ’å£ï¼Œè®“您在車上也å¯ä»¥æ¬£è³ž ipod æ’放的音樂。icarplay wireless - fm æ’放機monster icarplay wireless - fm transmitter æ’放機,能在為 ipod 充電的åŒæ™‚,é€éŽæ±½è»Šä¸Šçš„ fm 收音機æ’放 
ipod ä¸Šçš„éŸ³æ¨‚ï¼›å¿…é ˆæé…有 dock é€£æŽ¥åŸ çš„ ipod 使用。monster icase 旅行組為您的 ipod å’Œé…件準備一套完整的儲å˜å’Œä¿è·ç›’。這套旅行組包括 monster icarcharger 充電器和 monster isplitter è¿·ä½ ç«‹é«”è² y 形分岔接é ï¼›å¯æé…有 dock é€£æŽ¥åŸ çš„ ipod 使用。 belkin tunedok 車用放置架belkin tunedok 讓您開車時輕鬆帶著 ipod è½éŸ³æ¨‚。把您的 ipod 放在 tunedok 上,讓它的 air-grip å¸ç›¤å°‡ ipod 穩當固定在é©ç•¶çš„ä½ç½®ï¼Œä»¥ä¾¿æ‚¨éš¨æ‰‹å–用。å¯æé…所有機型的 ipod。
\ No newline at end of file diff --git a/juniper/src/test/queryparserTest.cpp b/juniper/src/test/queryparserTest.cpp new file mode 100644 index 00000000000..525e1a37189 --- /dev/null +++ b/juniper/src/test/queryparserTest.cpp @@ -0,0 +1,263 @@ +// Copyright 2016 Yahoo Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. +/** + * Implementation of the automated unit test class for the QueryParser + * class. + * + * @file queryparserTest.cpp + * + * @author Knut Omang + * + * @date Created 24 Feb 2003 + * + * $Id$ + * + * <pre> + * Copyright (c) : 2003 Fast Search & Transfer ASA + * ALL RIGHTS RESERVED + * </pre> + ***************************************************************************/ +#include <vespa/fastos/fastos.h> +#include <vespa/log/log.h> +LOG_SETUP(""); +#include "queryparserTest.h" +#include "fakerewriter.h" + + +// Comment out cerr below to ignore unimplemented tests +#define NOTEST(name) \ +std::cerr << std::endl << __FILE__ << ':' << __LINE__ << ": " \ + << "No test for method '" << (name) << "'" << std::endl; + +/************************************************************************* + * Test methods + * + * This section contains boolean methods for testing each public method + * in the class being tested + *************************************************************************/ + +/** + * Test of the UsefulIndex method. 
+ */ +void QueryParserTest::testUsefulIndex() { +// NOTEST("UsefulIndex"); +} + + +/** + * Test of the Index method (also implicit test of integration with + * expander interface) + */ +void QueryParserTest::testIndex() { + FakeRewriter fexp; + // Add as rewriter for query and not for document + juniper::_Juniper->AddRewriter("ourindex", &fexp, true, false); + juniper::QueryParser p("AND(ourindex:cake,myindex:eat)"); + _test(p.ParseError() == 0); + if (p.ParseError()) return; + + juniper::QueryHandle qh(p, NULL, juniper::_Juniper->getModifier()); + std::string stk; + qh.MatchObj(0)->Query()->Dump(stk); + _test_equal(stk, "Node<a:2>[Node<a:4>[cake0:100,cake1:100,cake2:100,cake3:100],eat:100]"); + + std::string stk1; + qh.MatchObj(6)->Query()->Dump(stk1); + _test_equal(stk1, "Node<a:2>[cake:100,eat:100]"); + + // Then let's add a reducer rewriter (should not affect anything..) + juniper::_Juniper->AddRewriter("myindex", &fexp, false, true); + std::string stk2; + qh.MatchObj(0)->Query()->Dump(stk2); + _test_equal(stk2, "Node<a:2>[Node<a:4>[cake0:100,cake1:100,cake2:100,cake3:100],eat:100]"); +} + + +/** + * Test of the Creator method. + */ +void QueryParserTest::testCreator() { +// NOTEST("Creator"); +} + + +/** + * Test of the Weight method. 
+ */ +void QueryParserTest::testWeight() { + { + // Complex nested query (bug example from datasearch 4.0) + juniper::QueryParser p2("OR(ANDNOT(AND(a,b),c),OR(d,e))"); + _test(p2.ParseError() == 0); + + juniper::QueryHandle qh2(p2, NULL, juniper::_Juniper->getModifier()); + std::string stk2; + qh2.MatchObj(0)->Query()->Dump(stk2); + _test_equal(stk2, "Node<a:2>[Node<a:2>[a:100,b:100],Node<a:2>[d:100,e:100]]"); + } + { + // Another complex nested query (bug example from datasearch 4.0) + juniper::QueryParser p2("OR(ANDNOT(RANK(a,OR(b,c)),d),OR(e,f))"); + _test(p2.ParseError() == 0); + + juniper::QueryHandle qh2(p2, NULL, juniper::_Juniper->getModifier()); + std::string stk2; + qh2.MatchObj(0)->Query()->Dump(stk2); + _test_equal(stk2, "Node<a:2>[a:100,Node<a:2>[e:100,f:100]]"); + } +} + + +/** + * Test of the Traverse method. + */ +void QueryParserTest::testTraverse() { + // simple OR query + juniper::QueryParser p1("OR(a,b,c)"); + _test(p1.ParseError() == 0); + + juniper::QueryHandle qh1(p1, NULL, juniper::_Juniper->getModifier()); + std::string stk1; + qh1.MatchObj(0)->Query()->Dump(stk1); + _test(strcmp(stk1.c_str(),"Node<a:3>[a:100,b:100,c:100]") == 0); + + { + // Complex query with phrases + juniper::QueryParser p2("OR(AND(xx,yy),PHRASE(junip*,proximity),PHRASE(data,search))"); + _test(p2.ParseError() == 0); + + juniper::QueryHandle qh2(p2, NULL, juniper::_Juniper->getModifier()); + std::string stk2; + qh2.MatchObj(0)->Query()->Dump(stk2); + _test(strcmp(stk2.c_str(), + "Node<a:3,v>[" + "Node<a:2>[xx:100,yy:100]," + "Node<a:2,o,l:0,e,v,c>[junip*:100,proximity:100]," + "Node<a:2,o,l:0,e,v,c>[data:100,search:100]]") == 0); + } + + { + // Triggering bug ticket 5690 Dev Data Search: + juniper::QueryParser p2("ANDNOT(ANDNOT(AND(cmsm,OR(cidus,ntus)," + "OR(jtft,jtct,jtin,jtfp)," + "OR(PHRASE(strategic,marketing)," + "PHRASE(marketing,strategy))),a))"); + _test(p2.ParseError() == 0); + + juniper::QueryHandle qh2(p2, NULL, juniper::_Juniper->getModifier()); + 
std::string stk2; + qh2.MatchObj(0)->Query()->Dump(stk2); + std::string s(stk2.c_str()); + _test_equal(s, + "Node<a:4,v>[cmsm:100,Node<a:2>[cidus:100,ntus:100]," + "Node<a:4>[jtft:100,jtct:100,jtin:100,jtfp:100]," + "Node<a:2,v>[Node<a:2,o,l:0,e,v,c>[strategic:100,marketing:100]," + "Node<a:2,o,l:0,e,v,c>[marketing:100,strategy:100]]]"); + } + + // Query with NEAR and WITHIN + juniper::QueryParser p3("OR(NEAR/1(linux,kernel),WITHIN/3(linus,torvalds))"); + _test(p3.ParseError() == 0); + + juniper::QueryHandle qh3(p3, NULL, juniper::_Juniper->getModifier()); + std::string stk3; + qh3.MatchObj(0)->Query()->Dump(stk3); + _test(strcmp(stk3.c_str(), + "Node<a:2,v>[" + "Node<a:2,l:1,v,c>[linux:100,kernel:100]," + "Node<a:2,o,l:3,v,c>[linus:100,torvalds:100]]") == 0); + + // Query with ONEAR + juniper::QueryParser p4("OR(ONEAR/3(linus,torvalds))"); + _test(p4.ParseError() == 0); + + juniper::QueryHandle qh4(p4, NULL, juniper::_Juniper->getModifier()); + std::string stk4; + qh4.MatchObj(0)->Query()->Dump(stk4); + _test(strcmp(stk4.c_str(), + "Node<a:2,o,l:3,v,c>[linus:100,torvalds:100]") == 0); +} + + +/************************************************************************* + * Test administration methods + *************************************************************************/ + +/** + * Set up common stuff for all test methods. + * This method is called immediately before each test method is called + */ +bool QueryParserTest::setUp() { + return true; +} + +/** + * Tear down common stuff for all test methods. 
+ * This method is called immediately after each test method is called + */ +void QueryParserTest::tearDown() { +} + +/** + * Build up a map with all test methods + */ +void QueryParserTest::init() { + test_methods_["testUsefulIndex"] = + &QueryParserTest::testUsefulIndex; + test_methods_["testIndex"] = + &QueryParserTest::testIndex; + test_methods_["testCreator"] = + &QueryParserTest::testCreator; + test_methods_["testWeight"] = + &QueryParserTest::testWeight; + test_methods_["testTraverse"] = + &QueryParserTest::testTraverse; +} + +/************************************************************************* + * main entry points + *************************************************************************/ + + +void QueryParserTest::Run(MethodContainer::iterator &itr) { + try { + if (setUp()) { + (this->*itr->second)(); + tearDown(); + } + } catch (...) { + _fail("Got unknown exception in test method " + itr->first); + } +} + +void QueryParserTest::Run(const char* method) { + MethodContainer::iterator pos(test_methods_.find(method)); + if (pos != test_methods_.end()) { + Run(pos); + } else { + std::cerr << "ERROR: No test method named \"" + << method << "\"" << std::endl; + _fail("No such method"); + } +} + +void QueryParserTest::Run() { + for (MethodContainer::iterator itr(test_methods_.begin()); + itr != test_methods_.end(); + ++itr) + Run(itr); +} + +/* + * Parse runtime arguments before running. + * If the -m METHOD parameter is given, run only that method + */ +void QueryParserTest::Run(int argc, char* argv[]) { + for (int i = 1; i < argc; ++i) { + if (strcmp(argv[i], "-m") == 0 && argc > i + 1) + { + Run(argv[++i]); + return; + } + } + Run(); +} diff --git a/juniper/src/test/queryparserTest.h b/juniper/src/test/queryparserTest.h new file mode 100644 index 00000000000..1a13e4d3cc1 --- /dev/null +++ b/juniper/src/test/queryparserTest.h @@ -0,0 +1,119 @@ +// Copyright 2016 Yahoo Inc. Licensed under the terms of the Apache 2.0 license. 
See LICENSE in the project root.
/**
 * Definition of the automated unit test class for the QueryParser class.
 *
 * @file queryparserTest.h
 *
 * @author Knut Omang
 *
 * @date Created 24 Feb 2003
 *
 * $Id$
 *
 * <pre>
 * Copyright (c) : 2003 Fast Search & Transfer ASA
 * ALL RIGHTS RESERVED
 * </pre>
 ***************************************************************************/
#pragma once

#include <map>
#include <vespa/fastlib/testsuite/test.h>
#include "testenv.h"
#include <vespa/juniper/queryparser.h>
#include <vespa/juniper/rewriter.h>

/**
 * The QueryParserTest class holds
 * the unit tests for the QueryParser class.
 *
 * Test methods are registered by name in a map when the object is
 * constructed, so that either all of them or a single named one can be run.
 *
 * @sa QueryParser
 * @author Knut Omang
 */
class QueryParserTest : public Test {

    /*************************************************************************
     *                      Test methods
     *
     * This section contains the methods for testing each public method
     * in the class being tested
     *************************************************************************/

    /**
     * Test of the UsefulIndex method.
     */
    void testUsefulIndex();


    /**
     * Test of the Index method.
     */
    void testIndex();


    /**
     * Test of the Creator method.
     */
    void testCreator();


    /**
     * Test of the Weight method.
     */
    void testWeight();


    /**
     * Test of the Traverse method.
     */
    void testTraverse();


    /*************************************************************************
     *                      Test administration methods
     *************************************************************************/

    /**
     * Set up common stuff for all test methods.
     * This method is called immediately before each test method is called
     */
    bool setUp();

    /**
     * Tear down common stuff for all test methods.
     * This method is called immediately after each test method is called
     */
    void tearDown();

    /** Pointer to a parameterless test method of this class. */
    typedef void(QueryParserTest::* tst_method_ptr) ();
    /** Maps a test method name to the method itself. */
    typedef std::map<std::string, tst_method_ptr> MethodContainer;
    /** All registered test methods; filled by init(). */
    MethodContainer test_methods_;
    /** Registers the test methods in test_methods_; called by the constructor. */
    void init();

protected:

    /**
     * Since we are running within Emacs, the default behavior of
     * print_progress which includes backspace does not work.
     * We'll use a single '.' instead.
     */
    virtual void print_progress() { *m_osptr << '.' << std::flush; }

public:

    /** Names the suite "QueryParser" and registers all test methods. */
    QueryParserTest() : Test("QueryParser"), test_methods_() { init(); }
    ~QueryParserTest() {}

    /*************************************************************************
     *                         main entry points
     *************************************************************************/
    /** Run the single test method referred to by itr. */
    void Run(MethodContainer::iterator &itr);
    /** Run all registered test methods. */
    virtual void Run();
    /** Run the test method registered under the given name. */
    void Run(const char *method);
    /** Run the method named by a "-m METHOD" argument, or all methods if none is given. */
    void Run(int argc, char* argv[]);
};


// Local Variables:
// mode:c++
// End:
diff --git a/juniper/src/test/queryparserTestApp.cpp b/juniper/src/test/queryparserTestApp.cpp new file mode 100644 index 00000000000..f7c8bce4afa --- /dev/null +++ b/juniper/src/test/queryparserTestApp.cpp @@ -0,0 +1,42 @@
// Copyright 2016 Yahoo Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
/**
 * Definition and implementation of the application for running unit tests
 * for the QueryParser class in isolation.
+ * + * @file queryparserTestApp.cpp + * + * @author Knut Omang + * + * @date Created 24 Feb 2003 + * + * $Id$ + * + * <pre> + * Copyright (c) : 2003 Fast Search & Transfer ASA + * ALL RIGHTS RESERVED + * </pre> + ****************************************************************************/ +#include <vespa/fastos/fastos.h> +#include <vespa/log/log.h> +LOG_SETUP("queryparserTest"); +#include "queryparserTest.h" +#include "testenv.h" + +/** + * The QueryParserTestApp class is the main routine for running the unit + * tests for the QueryParser class in isolation. + * + * @sa QueryParser @author Knut Omang + */ +class QueryParserTestApp : public FastOS_Application { +public: + virtual int Main() { + juniper::TestEnv te(this, "../rpclient/testclient.rc"); + QueryParserTest test; + test.SetStream(&std::cout); + test.Run(_argc, _argv); + return (int)test.Report(); + } +}; + +FASTOS_MAIN(QueryParserTestApp); diff --git a/juniper/src/test/queryvisitor_test.cpp b/juniper/src/test/queryvisitor_test.cpp new file mode 100644 index 00000000000..cc2d224e7d0 --- /dev/null +++ b/juniper/src/test/queryvisitor_test.cpp @@ -0,0 +1,75 @@ +// Copyright 2016 Yahoo Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. 
+#include <memory> +#include <vespa/fastos/fastos.h> +#include <vespa/log/log.h> +LOG_SETUP("queryvisitor_test"); +#include <vespa/vespalib/testkit/testapp.h> + +#include <vespa/juniper/queryhandle.h> +#include <vespa/juniper/queryvisitor.h> +#include <vespa/vespalib/stllike/string.h> + +using namespace juniper; + +class MyQuery : public juniper::IQuery +{ +private: + vespalib::string _term; + +public: + MyQuery(const vespalib::string &term) : _term(term) {} + + virtual bool Traverse(IQueryVisitor* v) const override { + v->VisitKeyword(nullptr, _term.c_str(), _term.size()); + return true; + } + virtual int Weight(const QueryItem*) const override { + return 0; + } + virtual ItemCreator Creator(const QueryItem*) const override { + return ItemCreator::CREA_ORIG; + } + virtual const char* Index(const QueryItem*, size_t*) const override { + return "my_index"; + } + virtual bool UsefulIndex(const QueryItem*) const override { + return true; + } +}; + +struct Fixture +{ + MyQuery query; + QueryModifier modifier; + QueryHandle handle; + QueryVisitor visitor; + Fixture(const vespalib::string &term) + : query(term), + modifier(), + handle(query, "", modifier), + visitor(query, &handle, modifier) + {} +}; + +TEST_F("require that terms are picked up by the query visitor", Fixture("my_term")) +{ + auto query = std::unique_ptr<QueryExpr>(f.visitor.GetQuery()); + ASSERT_TRUE(query != nullptr); + QueryNode *node = query->AsNode(); + ASSERT_TRUE(node != nullptr); + EXPECT_EQUAL(1, node->_arity); + QueryTerm *term = node->_children[0]->AsTerm(); + ASSERT_TRUE(term != nullptr); + EXPECT_EQUAL("my_term", vespalib::string(term->term())); +} + +TEST_F("require that empty terms are ignored by the query visitor", Fixture("")) +{ + QueryExpr *query = f.visitor.GetQuery(); + ASSERT_TRUE(query == nullptr); +} + +TEST_MAIN() +{ + TEST_RUN_ALL(); +} diff --git a/juniper/src/test/testenv.cpp b/juniper/src/test/testenv.cpp new file mode 100644 index 00000000000..40f35072948 --- /dev/null +++ 
b/juniper/src/test/testenv.cpp @@ -0,0 +1,124 @@ +// Copyright 2016 Yahoo Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. +/* Setup and parameter parsing for static Juniper environment to reuse + * within test framework + */ + +#include <vespa/fastos/fastos.h> +#include <vespa/log/log.h> +LOG_SETUP(""); +#include "testenv.h" +#include <vespa/juniper/propreader.h> + + +namespace juniper +{ + +bool color_highlight = false; +// Easy access in tests.. +Config* TestConfig; +Juniper * _Juniper; + + +TestEnv::TestEnv(FastOS_Application* app, const char* propfile) : + _props(), _config(), _juniper(), _wordFolder() +{ + char c; + const char* oarg = NULL; + int oind = 1; + + while ((c = app->GetOpt("d:hcm:", oarg, oind)) != EOF) + { + switch (c) + { + case 'd': +#ifdef FASTOS_DEBUG + debug_level = strtol(oarg, NULL, 0); +#else + fprintf(stderr, "This version of Juniper compiled without debug\n"); +#endif + break; + case 'c': + color_highlight = true; + break; + case 'm': + // option handled by test framework + break; + case 'h': + default: + Usage(app->_argv[0]); + return; + } + } + + int expected_args = 0; + + if (app->_argc - oind < expected_args) + { + Usage(app->_argv[0]); + return; + } + + _props.reset(new PropReader(propfile)); + + if (color_highlight) + { + _props->UpdateProperty("juniper.dynsum.highlight_on", "\\1b[1;31m"); + _props->UpdateProperty("juniper.dynsum.highlight_off", "\\1b[0m"); + } + + _juniper.reset(new Juniper(_props.get(), &_wordFolder)); + _Juniper = _juniper.get(); + _config = _juniper->CreateConfig(); + TestConfig = _config.get(); +} + +TestEnv::~TestEnv() +{ +} + +void TestEnv::Usage(char* s) +{ + fprintf(stderr, "Usage: %s [options]\n", s); + fprintf(stderr, "Available options:\n"); + fprintf(stderr, " -d<debugmask>: Turn on debugging\n"); + fprintf(stderr, " -h: This help\n"); +} + + +TestQuery::TestQuery(const char* qexp, const char* options) : + _qparser(qexp), + _qhandle(_qparser, options, 
_Juniper->getModifier()) +{ } + + +PropertyMap::PropertyMap() + : _map() +{ +} + + +PropertyMap::~PropertyMap() +{ +} + + +PropertyMap & +PropertyMap::set(const char *name, const char *value) +{ + _map[std::string(name)] = std::string(value); + return *this; +} + + +const char * +PropertyMap::GetProperty(const char* name, const char* def) +{ + std::map<std::string, std::string>::iterator res = _map.find(std::string(name)); + if (res != _map.end()) { + return res->second.c_str(); + } + return def; +} + + +} // end namespace juniper diff --git a/juniper/src/test/testenv.h b/juniper/src/test/testenv.h new file mode 100644 index 00000000000..e7dde4da552 --- /dev/null +++ b/juniper/src/test/testenv.h @@ -0,0 +1,71 @@ +// Copyright 2016 Yahoo Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. +#pragma once + +/* Include most of the stuff that we might need */ + +#include <vespa/fastos/fastos.h> +#include <vespa/fastlib/text/unicodeutil.h> +#include <vespa/fastlib/text/normwordfolder.h> +#include <vespa/juniper/query.h> +#include <vespa/juniper/juniperdebug.h> +#include <vespa/juniper/rpinterface.h> +#include <vespa/juniper/queryhandle.h> +#include <vespa/juniper/queryparser.h> +#include <vespa/juniper/queryvisitor.h> +#include <vespa/juniper/result.h> +#include <vespa/juniper/config.h> +#include <vespa/juniper/queryparser.h> +#include <vespa/juniper/matchobject.h> +#include <vespa/juniper/SummaryConfig.h> +#include <vespa/juniper/Matcher.h> +#include <vespa/juniper/mcand.h> +#include <vespa/juniper/propreader.h> +#include <vespa/juniper/specialtokenregistry.h> + +namespace juniper +{ + +class TestEnv +{ +public: + TestEnv(FastOS_Application* app, const char* propfile); + virtual ~TestEnv(); + void Usage(char* s); +private: + std::unique_ptr<PropReader> _props; + std::unique_ptr<Config> _config; + std::unique_ptr<Juniper> _juniper; + Fast_NormalizeWordFolder _wordFolder; + TestEnv(const TestEnv&); + TestEnv& operator=(const 
TestEnv&); +}; + + +class TestQuery +{ +public: + TestQuery(const char* qexp, const char* options = NULL); + QueryParser _qparser; + QueryHandle _qhandle; +}; + + +class PropertyMap : public IJuniperProperties +{ +private: + std::map<std::string, std::string> _map; +public: + PropertyMap(); + virtual ~PropertyMap(); + PropertyMap &set(const char *name, const char *value); + virtual const char* GetProperty(const char* name, const char* def = NULL); +}; + + +extern Config* TestConfig; +extern Juniper * _Juniper; + +} // end namespace juniper + +typedef juniper::TestQuery TestQuery; + diff --git a/juniper/src/testproject.el b/juniper/src/testproject.el new file mode 100644 index 00000000000..56cc68fe61d --- /dev/null +++ b/juniper/src/testproject.el @@ -0,0 +1,97 @@ +;; testproject.el + +;; Local configurations for the cpptest Emacs unit-test +;; framework. This is just an example of typical variables that one +;; usually uses This file should be located in the same directory as +;; the class(es) you want to test. + +;; $Revision: 1.2 $ $Date: 2003-02-27 12:32:24 $ +;; Author: Nils Sandøy <nils.sandoy@fast.no> + +;; Just a message to show that this file is beeing read. Look for this +;; in the *Messages* buffer. +(message "Setting local test configuration for the module") + +;; Don't use an underscore based naming scheme +;; classes and method names will Upcase each word instead +(setq cppt-use-underscore-p nil) + +;; Use author in documentation. Set this value to nil if not +(setq cppt-doc-author-p "t") + +;; This is a subdirectory of the directory in which this file, along with +;; the source code to test, resides +(setq cppt-test-dir "test") + +;; Use this variable to include extra file in your test source, and +;; application files. Typically this will hold headers for log +;; functionality etc. 
+;; Example: (setq cppt-extra-source-includes "#include \"../Log.h\"") +(setq cppt-extra-source-includes "#include \"testenv.h\"") + +;; If the above source files are not part of a library, you will +;; probably have to include them in the fastos.project file. +;; Example: (setq cppt-extra-object-files '("../Log")) +(setq cppt-extra-object-files '("testenv")) + +;; If the source code does not have a fastos.project file with all +;; required libraries for linking an executable (typically the case +;; when the source is part of a library itself), then you should use +;; this variable to provide a list of libraries which will be appended +;; to the EXTERNALLIBS section for all applications in the +;; fastos.project file. +;; Example: (setq cppt-extra-libraries '("fast")) +(setq cppt-extra-libraries '("src/juniper")) +(setq cppt-extra-external-libraries '("fast")) + +;; Include source file in test executables. +;; Set this to nil if you are testing part of a library +(setq cppt-include-source-p "t") + + +;; If your initialisation code below requires special parameters for +;; running the test executables, add them here +;; Example: (setq cppt-test-parameters "--test-mode") +(setq cppt-test-parameters "") + +;; If you support a special debug mode, which is executed through the +;; cppt-suite-debug or cppt-run-test-debug methods, then you should +;; add the parameter for identifying this here +;; Example: (setq cppt-test-dbflags "-d") +(setq cppt-test-dbflags "") + +;; If you support logging etc, you should include code here for +;; initializing this as part of the Main body of the test application +;; Example: +;; Add initialization code that turns on logging, and logs to stderr in debug +;; mode +;; (setq cppt-application-init-code +;; "RTLogDistributor::GetInstance().RegisterDestination( +;; new Fast_FileLogger(\"CLASSTest.log\"), FLOG_ALL); +;; for (int i=0; i < argc; ++i) { +;; if (strcmp(argv[i], \"-d\") == 0) { +;; // Turn on debug mode (log to stderr) +;; 
RTLogDistributor::GetInstance().RegisterDestination( +;; new Fast_FileLogger(stderr), FLOG_ALL); +;; LOG_DBG(\"Running in debug mode\"); +;; } +;; }") +(setq cppt-application-init-code "") + +;; Pretty much the same as the application init code, but this is used +;; for the Main method of the test suite. +;; Example: +;; Add initialization code that turns on logging, and logs to stderr in debug +;; mode +;; (setq cppt-suite-init-code +;; "RTLogDistributor::GetInstance().RegisterDestination( +;; new Fast_FileLogger(\"SUITETest.log\"), FLOG_ALL); +;; for (int i=0; i < argc; ++i) { +;; if (strcmp(argv[i], \"-d\") == 0) { +;; // Turn on debug mode (log to stderr) +;; RTLogDistributor::GetInstance().RegisterDestination( +;; new Fast_FileLogger(stderr), FLOG_ALL); +;; LOG_DBG(\"Running in debug mode\"); +;; } +;; }") +(setq cppt-suite-init-code "") diff --git a/juniper/src/vespa/juniper/.gitignore b/juniper/src/vespa/juniper/.gitignore new file mode 100644 index 00000000000..d4d35b71c0f --- /dev/null +++ b/juniper/src/vespa/juniper/.gitignore @@ -0,0 +1,8 @@ +*.So +*.dsp +*.ilk +*.lib +.depend +Makefile +juniper.tag +libjuniper.so.5.1 diff --git a/juniper/src/vespa/juniper/CMakeLists.txt b/juniper/src/vespa/juniper/CMakeLists.txt new file mode 100644 index 00000000000..97cc8e3a1cc --- /dev/null +++ b/juniper/src/vespa/juniper/CMakeLists.txt @@ -0,0 +1,29 @@ +# Copyright 2016 Yahoo Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. 
vespa_add_library(juniper
    SOURCES
    Matcher.cpp
    sumdesc.cpp
    mcand.cpp
    keyocc.cpp
    juniperparams.cpp
    SummaryConfig.cpp
    tokenizer.cpp
    propreader.cpp
    stringmap.cpp
    rpinterface.cpp
    dpinterface.cpp
    querynode.cpp
    queryvisitor.cpp
    queryhandle.cpp
    matchobject.cpp
    result.cpp
    config.cpp
    matchelem.cpp
    queryparser.cpp
    querymodifier.cpp
    expcache.cpp
    reducematcher.cpp
    specialtokenregistry.cpp
    INSTALL lib64
    DEPENDS
)
diff --git a/juniper/src/vespa/juniper/IJuniperProperties.h b/juniper/src/vespa/juniper/IJuniperProperties.h new file mode 100644 index 00000000000..d52b1e69058 --- /dev/null +++ b/juniper/src/vespa/juniper/IJuniperProperties.h @@ -0,0 +1,26 @@
// Copyright 2016 Yahoo Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
#pragma once

#include <vespa/fastos/fastos.h>

/** @file IJuniperProperties.h Defining the IJuniperProperties class */

/** An abstract interface to configuration file settings used by Juniper to process
 *  its preconfigured parameter sets.
 */
class IJuniperProperties
{
public:
    /** Get the value of a property
     * @param name The textual representation of the property,
     *   assumed to be of the form class.juniperpart.variable, for example
     *   juniper.dynsum.length
     * @param def A default value for the property if not found in configuration
     * @return The value of the property, or @p def if no such property is set
     */
    virtual const char* GetProperty(const char* name, const char* def = NULL) = 0;

    virtual ~IJuniperProperties() {};
};


diff --git a/juniper/src/vespa/juniper/ITokenProcessor.h b/juniper/src/vespa/juniper/ITokenProcessor.h new file mode 100644 index 00000000000..2bfe42ebb51 --- /dev/null +++ b/juniper/src/vespa/juniper/ITokenProcessor.h @@ -0,0 +1,63 @@
// Copyright 2016 Yahoo Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+#pragma once +#include <vespa/fastos/fastos.h> +#include <vespa/fastlib/text/unicodeutil.h> + +/** Implement this interface for objects that intend to serve as processing stages in + * a Juniper token processor pipeline. + */ + +class ITokenProcessor +{ +public: + /** Token definition. Note that not all information might be available at all stages. + * As a minimum token, bytepos, wordpos and bytelen should have been set. + * Other fields should have been set to 0 and should be left untouched if not provided + * by the processor. + */ + struct Token + { + Token() : token(NULL), bytepos(0), charpos(0), wordpos(0), + bytelen(0), charlen(0), curlen(0) + {} + const ucs4_t* token; //!< a normalized UCS4 representation of the token + off_t bytepos; //!< Position in bytes from start of original text + off_t charpos; //!< Position in number of characters according to utf8 encoding + off_t wordpos; //!< Position in number of words + int bytelen; //!< Size in bytes of the original token as in the text + int charlen; //!< Size in number of utf8 characters + int curlen; //!< Size in ucs4_t of the token after conversions + + Token(const Token& other) : token(other.token), + bytepos(other.bytepos), + charpos(other.charpos), + wordpos(other.wordpos), + bytelen(other.bytelen), + charlen(other.charlen), + curlen(other.curlen) {} + Token& operator= (const Token& other) { + token = other.token; + bytepos = other.bytepos; + charpos = other.charpos; + wordpos = other.wordpos; + bytelen = other.bytelen; + charlen = other.charlen; + curlen = other.curlen; + return *this; + } + }; + + virtual ~ITokenProcessor() {} + + /** handle the next token + * @param token The token to process. + */ + virtual void handle_token(Token& token) = 0; + + /** handle the end of the text as a special, zero length token. + * @param token The token to process. 
+ */ + virtual void handle_end(Token& token) = 0; +}; + + diff --git a/juniper/src/vespa/juniper/Matcher.cpp b/juniper/src/vespa/juniper/Matcher.cpp new file mode 100644 index 00000000000..2a9ab3e52eb --- /dev/null +++ b/juniper/src/vespa/juniper/Matcher.cpp @@ -0,0 +1,562 @@ +// Copyright 2016 Yahoo Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. +/* $Id$ */ + +#include <vespa/fastos/fastos.h> +#include <vespa/log/log.h> +LOG_SETUP(".juniper.matcher"); +#include <algorithm> +#include <string> +#include "query.h" +#include "juniperdebug.h" +#include "sumdesc.h" +#include "Matcher.h" +#include "foreach_utils.h" +#include "SummaryConfig.h" +#include "querynode.h" +#include "mcand.h" +#include "matchobject.h" +#include "result.h" +#include "juniperparams.h" +#include "config.h" + +unsigned debug_level = 0; + +#define KEY_OCC_RESERVED 10 + +Matcher::Matcher(Result* result) : + _result(result), + _qhandle(result->_qhandle), + _mo(_qhandle->MatchObj(result->_langid)), + _match_iter(_mo, result), + _winsize(600), + _winsizeFallback(_winsize*10), + _max_match_candidates(1000), + _proximity_noconstraint_offset(PROXIMITYBOOST_NOCONSTRAINT_OFFSET), + _proximity_factor(1.0), + _need_complete_cnt(3), + _endpos(0), + _nontermcnt(_mo->NontermCount()), + _occ(), + _wrk_set(NULL), + _matches(), + _ctxt_start(0), + _log_mask(0), + _log_text("") +{ + _occ.reserve(KEY_OCC_RESERVED); + DocsumParams& dsp = _result->_config->_docsumparams; + _winsize = _result->WinSize(); + _winsizeFallback = static_cast<size_t>(_result->WinSizeFallbackMultiplier() * _winsize); + _max_match_candidates = _result->MaxMatchCandidates(); + _need_complete_cnt = dsp.MaxMatches(); + _wrk_set = new match_sequence[_nontermcnt]; + LOG(debug, "Matcher(): winsize(%zu), winsize_fallback(%zu), max_match_candidates(%zu), need_complete_cnt(%d)", + _winsize, _winsizeFallback, _max_match_candidates, _need_complete_cnt); + +} + +Matcher::~Matcher() +{ + reset_document(); + delete[] 
_wrk_set; +} + + +// Efficient object creation/deletion + +MatchCandidate* Matcher::NewCandidate(QueryExpr* query) +{ + typedef MatchElement * MatchElementP; + return new MatchCandidate(query, new MatchElementP[query->_arity], _ctxt_start); +} + + +MatchCandidate* Matcher::RefCandidate(MatchCandidate* m) +{ + if (!m) return NULL; + m->ref(); + if (LOG_WOULD_LOG(spam)) { + std::string s; m->dump(s); + LOG(spam, "RefCandidate: %s", s.c_str()); + } + return m; +} + + +void Matcher::DerefCandidate(MatchCandidate* m) +{ + if (!m) return; + if (LOG_WOULD_LOG(spam)) { + std::string s; m->dump(s); + LOG(spam, "DerefCandidate: %s", s.c_str()); + } + if (m->deref()) return; + // Dereference all the complex (MatchCandidate) children of m: + for (int i = 0; i < m->elem_store_sz(); i++) { + if (m->element[i]) + DerefCandidate(m->element[i]->Complex()); + } + delete m; +} + + +Matcher& Matcher::SetProximityFactor(float proximity_factor) +{ + if (proximity_factor != 1) { + LOG(debug, "Proximity factor %.1f", proximity_factor); + } + _proximity_factor = proximity_factor; + return *this; +} + + +void Matcher::reset_document() +{ + // Delete all our document specific data structures to reset to initial state: + LOG(debug, "Matcher: resetting document"); + flush_candidates(); + reset_matches(); + reset_occurrences(); + _endpos = 0; +} + +void Matcher::reset_matches() +{ + LOG(debug, "reset_matches"); + for (match_candidate_set::iterator it = _matches.begin(); it != _matches.end(); ++it) + DerefCandidate(*it); + _matches.clear(); + _ctxt_start = 0; +} + +void Matcher::reset_occurrences() +{ + delete_all(_occ); + _occ.clear(); +} + + +void Matcher::update_match(MatchCandidate* m) +{ + QueryNode* nexp = m->match()->_parent; + if (!nexp) { // root node of query + _matches.insert(m); + // Tag all terms + m->set_valid(); + } else { + // Add the parent candidate + MatchCandidate* nm = NewCandidate(nexp); + match_sequence& cs = _wrk_set[nexp->_node_idx]; + cs.push_back(nm); + + // Update the 
parent candidate work set + update_wrk_set(_wrk_set[nexp->_node_idx], m, m->match()); + + // This candidate was removed from it's wrk set but + // the ref is not forwarded to the matches list since it is an + // intermediate node.. + DerefCandidate(m); + } +} + + + +bool Matcher::add_occurrence(off_t pos, off_t tpos, size_t len) +{ + QueryTerm* mexp = _match_iter.current(); + + LOG(spam, "Match: %s(%ld)", mexp->term(), tpos); + + // Add new occurrence to sequence of all occurrences + key_occ_ptr k = new key_occ(mexp->term(), pos, tpos, len); + if (!k) return false; + + _occ.push_back(k); + + if (!(_need_complete_cnt > 0)) { + size_t nodeno; + // From the head of the sequences, remove any candidates that are + // "too old", eg. that is not complete within the winsize window + // and also trig further processing of complete matches: + for (nodeno = 0; nodeno < _nontermcnt; nodeno++) { + match_sequence& ws = _wrk_set[nodeno]; + for (match_sequence::iterator it = ws.begin(); it != ws.end();) { + MatchCandidate* m = (*it); + if ((k->startpos() - m->startpos()) < static_cast<int>(_winsize)) break; + it = ws.erase(it); // This moves the iterator forward + if (m->partial_ok()) + update_match(m); + else + DerefCandidate(m); + } + } + } + + // Then add a new candidate starting at the currently found keyword + // for each subexpression that matches this keyword + for (; mexp != NULL; mexp = _match_iter.next()) + { + QueryNode* pexp = mexp->_parent; + assert(pexp); + MatchCandidate* nm = NewCandidate(pexp); + if (!nm || nm->elems() < 0) { + LOG(error, "Matcher could not allocate memory for candidate - bailing out"); + if (nm) DerefCandidate(nm); + return false; + } + match_sequence& cs = _wrk_set[pexp->_node_idx]; + if (cs.size() >= _max_match_candidates) { + DerefCandidate(nm); + LOG(debug, "The max number of match candidates (%zu) in the work set for query node idx '%u' has been reached. 
" + "No more candidates are added", _max_match_candidates, pexp->_node_idx); + } else { + cs.push_back(nm); + } + update_wrk_set(cs, k, mexp); + } + return true; +} + + + +void Matcher::update_wrk_set(match_sequence& ws, MatchElement* k, QueryExpr* mexp) +{ + if (LOG_WOULD_LOG(spam)) { + std::string s; k->dump(s); + LOG(spam, "update_wrk_set(): match_sequence.size(%zu), element(%s)", ws.size(), s.c_str()); + } + + // update this working set (start with the freshest) + for (match_sequence::reverse_iterator rit = ws.rbegin(); rit != ws.rend();) { + MatchCandidate* m = (*rit); + + MatchCandidate::accept_state as = m->accept(k, mexp); + + // If a candidate already has this keyword, then all earlier + // candidates also has the keyword + if (as == MatchCandidate::M_EXISTS) break; + + + // Just accepted this candidate into another higher level + if (as != MatchCandidate::M_OVERLAP) { + MatchCandidate* mu = k->Complex(); + RefCandidate(mu); + } + + // we should allow a slighly larger winsize here because we have not found all matches yet. 
+ if ((as == MatchCandidate::M_EXPIRED) || ((k->startpos() - m->startpos()) >= static_cast<int>(_winsizeFallback))) { + // remove from current pos and delete - can never be satisfied + match_sequence::reverse_iterator new_rit(ws.erase((++rit).base())); + rit = new_rit; + DerefCandidate(m); + } else { + // If this one got complete, move it to the ranked set or trigger updates + // of parent candidates if subquery match + if (m->complete()) { + // STL hackers' heaven - removing this element unconditionally from _wrk_set['k'] + match_sequence::reverse_iterator new_rit(ws.erase((++rit).base())); + rit = new_rit; + + if (m->matches_limit()) { + if (_need_complete_cnt > 0) { + _need_complete_cnt--; + } + update_match(m); + } else { + DerefCandidate(m); + } + } else { + ++rit; + } + } + } + if (LOG_WOULD_LOG(spam)) { + std::string s; k->dump(s); + LOG(spam, "END update_wrk_set, '%s'", s.c_str()); + } +} + + +// Flush all remaining candidates upon context change or document end: +void Matcher::flush_candidates() +{ + int cands = 0; + for (size_t i = 0; i < _nontermcnt; i++) { + match_sequence& ws = _wrk_set[i]; + for (match_sequence::iterator it = ws.begin(); it != ws.end(); ++it) { + cands++; + MatchCandidate* m = (*it); + if (m->partial_ok()) + update_match(m); + else + DerefCandidate(m); + } + ws.clear(); + } + LOG(debug, "Flushing done (%d candidates)", cands); +} + + +void Matcher::set_log(unsigned long log_mask) +{ + _log_mask = log_mask; +} + + +void Matcher::handle_token(Token& token) +{ + if (LOG_WOULD_LOG(debug)) { + char utf8token[1024]; + Fast_UnicodeUtil::utf8ncopy(utf8token, token.token, 1024, + (token.token != NULL ? token.curlen : 0)); + LOG(debug, "handle_token(%s)", utf8token); + } + + unsigned options = 0; + if (_mo->Match(_match_iter, token, options)) { + // Found a match. 
Record it with original pos and length + add_occurrence(token.bytepos, token.wordpos, token.bytelen); + } + // Keep track of end of the text + _endpos = token.bytepos + token.bytelen; +} + + +void Matcher::handle_end(Token& token) +{ + if (LOG_WOULD_LOG(debug)) { + char utf8token[1024]; + Fast_UnicodeUtil::utf8ncopy(utf8token, token.token, 1024, + (token.token != NULL ? token.curlen : 0)); + LOG(debug, "handle_end(%s)", utf8token); + } + if (LOG_WOULD_LOG(spam)) { + dump_occurrences(100); + LOG(spam, "Topmost 10 matches found:"); + dump_matches(10, false); + } + JL(JD_MDUMP, log_matches(20)); + // Just keep track of end of the text + _endpos = token.bytepos; + // flush here for now since we do not traverse all the nonterminal lists for each kw. + flush_candidates(); +} + + +void Matcher::dump_matches(int printcount, bool best) +{ + assert(!best); // This functionality removed + match_candidate_set& m = _matches; + + if (!best) { + // flush the remaining match candidates to the list of matches, if any: + flush_candidates(); + } + int i = 0; + std::ostringstream oss; + oss << "dump_matches(" << m.size() << "):\n"; + i = 0; + for (match_candidate_set::iterator it = m.begin(); it != m.end(); ++it) { + if (i >= printcount) break; +// if ((*it)->distance() == 0) break; + std::string s; + (*it)->dump(s); + oss << s << "\n"; + i++; + } + LOG(spam, "%s", oss.str().c_str()); +} + + +void Matcher::log_matches(int printcount) +{ + int nterms = QueryTerms(); + match_candidate_set& m = _matches; + + // flush the remaining match candidates to the list of matches, if any: + flush_candidates(); + char buf[200]; + + int i = 0; + _log_text.append("<table>"); + if (m.size() > 0) { + _log_text.append("<tr class=shade>"); + sprintf(buf, "<td colspan=%d align=center><b>Topmost %d matches out of %d", + nterms+2, std::min(printcount, m.size()),m.size()); + _log_text.append(buf); + _log_text.append("</b></td></tr>"); + } + _log_text.append("<tr class=shadehead>"); + for (i = 0; i < nterms; 
i++) { + _log_text.append("<td>"); + _log_text.append(_mo->Term(i)->term()); + _log_text.append("</td>"); + } + if (m.size() > 0) { + _log_text.append("<td align=right>distance</td><td align=right>rank</td></tr>\n"); + i = 0; + for (match_candidate_set::iterator it = m.begin(); it != m.end(); ++it) + { + if (i >= printcount) break; + _log_text.append("<tr class=shade>"); + (*it)->log(_log_text); + _log_text.append("</tr>"); + i++; + } + } + _log_text.append("<tr class=shadehead>"); + sprintf(buf, "<td colspan=%d align=center><b>Total(exact) keyword hits</b></td>", + nterms); + _log_text.append(buf); + _log_text.append("</tr><tr class=shade>"); + for (i = 0; i < nterms; i++) { + sprintf(buf, "<td>%d(%d)</td>", TotalMatchCnt(i), ExactMatchCnt(i)); + _log_text.append(buf); + } + _log_text.append("</tr></table>"); +} + + + +void Matcher::dump_occurrences(int printcount) +{ + std::ostringstream oss; + oss << "dump_occurrences:\n"; + int i = 0; + for (key_occ_vector::iterator kit = _occ.begin(); kit != _occ.end(); ++kit) { + std::string s; + (*kit)->dump(s); + oss << s << "\n"; + i++; + if (i > printcount) { + oss << "...cont...\n"; + break; + } + } + LOG(spam, "%s", oss.str().c_str()); +} + + +void Matcher::dump_statistics() +{ + int i; + int nterms = QueryTerms(); + + fprintf(stderr, "%20s %12s %12s\n", "Term", "Matches", "Exact"); + for (i = 0; i < nterms; i++) { + QueryTerm* q = _mo->Term(i); + fprintf(stderr, "%20s %12d %12d\n", q->term(), q->total_match_cnt, + q->exact_match_cnt); + } +} + + + +// Debugging/testing: + +int Matcher::TotalMatchCnt(int number) +{ + if (number < QueryTerms() && number >= 0) + return _mo->Term(number)->total_match_cnt; + else + return 0; +} + + +int Matcher::ExactMatchCnt(int number) +{ + if (number < QueryTerms() && number >= 0) + return _mo->Term(number)->exact_match_cnt; + else + return 0; +} + + +const char* Matcher::QueryTermText(int term_no) +{ + return _mo->Term(term_no)->term(); +} + + +std::string Matcher::GetLog() +{ + return 
_log_text; +} + + +SummaryDesc* Matcher::CreateSummaryDesc(size_t length, size_t min_length, int max_matches, + int surround_len) +{ + // No point in processing this document if no keywords found at all: + if (TotalHits() <= 0) return NULL; + + LOG(debug, "Matcher: sum.desc (length %lu, min_length %lu, max matches %d, " + "surround max %d)", + static_cast<unsigned long>(length), + static_cast<unsigned long>(min_length), + max_matches, surround_len); + return new SummaryDesc(this, length, min_length, max_matches, surround_len); +} + + +// This should rather be called ProximityRank() now: +long Matcher::GlobalRank() +{ + // Proximity ranking only applies to multi term queries, return a constant + // in all other cases: + if (QueryTerms() <= 1) return _proximity_noconstraint_offset; + + match_candidate_set::iterator it = _matches.begin(); +#ifdef JUNIPER_1_0_RANK + if (it == _matches.end()) return 0; + + // Rank is computed as the rank of the best match within the document + // boosted with the total number of found occurrences of any of the words in the query + // normalized by the number of words in the query: + return ((*it)->rank() >> 3) + ((TotalHits()/nterms) << 2); +#else + // Rank is computed as the rank of the 3 best matches within the document + // with each subsequent match counting 80% of the previous match. + // + long rank_val = 0; + const int quotient = 5; + const int prod = 4; + int r_quotient = 1; + int r_prod = 1; + const int best_matches = 3; // candidate(s) for parametrisation! + + for (int i = 0; i < best_matches && it != _matches.end(); i++) { + rank_val += (((*it)->rank()*r_prod/r_quotient) >> 4); + r_quotient *= quotient; + r_prod *= prod; + ++it; + } + + // Return negative weight of no hits and any of the explicit limits in effect + // Eg. 
NEAR/WITHIN but make exception for PHRASE since that is better + //handled by the index in the cases where there are more information at that stage: + if (!rank_val && _mo->HasConstraints()) + return 0; + + // shift down to a more suitable range for fsearch. Multiply by configured boost + // Add configured offset + return (long)((double)(rank_val >> 1) * _proximity_factor) + _proximity_noconstraint_offset; +#endif +} + + +/* These operations can be performed after the matcher is no longer existing.. + * + */ +std::string BuildSummary(const char* buffer, size_t buflen, SummaryDesc* summary, + const SummaryConfig* config, size_t& char_size) +{ + return summary->get_summary(buffer, buflen, config, char_size); +} + + +void DeleteSummaryDesc(SummaryDesc* s) +{ + LOG(debug, "Matcher: deleting SummaryDesc"); + delete s; +} diff --git a/juniper/src/vespa/juniper/Matcher.h b/juniper/src/vespa/juniper/Matcher.h new file mode 100644 index 00000000000..01f230c8f39 --- /dev/null +++ b/juniper/src/vespa/juniper/Matcher.h @@ -0,0 +1,204 @@ +// Copyright 2016 Yahoo Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. +/* $Id$ */ + +#pragma once + +#include "keyocc.h" +#include "mcand.h" +#include "queryhandle.h" + +// #define USE_OLD_SCANNER 1 + + +#ifdef USE_OLD_SCANNER +#define TokenDispatcher DocScanner +#endif + +#include <vector> +#include <list> +#include "multiset.h" +#include <map> +#include <string> +#include "ITokenProcessor.h" +#include "querynode.h" +#include "matchobject.h" +#include "querymodifier.h" + +#ifdef __hpux__ +// HP-UX does "magic" with max and min macros so algorithm must have been included +// before we attempt to define the macros.. +#include <algorithm> +#endif + +class MatchCandidate; + +// Define this to get hash algorithm to do keyword comparisons +// O(1) wrt. 
// number of query words
#define USE_HASHED_KEYCMP 1

#ifdef USE_HASHED_KEYCMP
#include "hashbase.h"
#endif

/* Max number of terms to do matching for */
#define MAXTERMS 20


class SummaryDesc;
class SummaryConfig;
class QueryTerm;

// One working set: the match candidates currently open for one query node
typedef std::list<MatchCandidate*> match_sequence;

/** Token processor that records keyword occurrences in a document and
 *  maintains the match candidates built from them.
 */
class Matcher : public ITokenProcessor
{
public:
    Matcher(juniper::Result* result);
    virtual ~Matcher();

    Matcher& SetProximityFactor(float proximity_factor);

    /** Call reset_document upon a new document */
    void reset_document();
    void log_document(long id);

    /** Enable logging (generation of long string)
     *  @param log_mask The log feature bits to turn on
     */
    void set_log(unsigned long log_mask);

    /** Token handlers to be called by tokenization step */
    virtual void handle_token(Token& token);
    virtual void handle_end(Token& token);

    /** Utilities for dumping state to the log (spam level) / stderr */
    void dump_matches(int printcount = 10, bool best = false);
    void dump_occurrences(int printcount);
    void dump_statistics();

    /** Utilities for logging to log output buffer */
    void log_matches(int printcount = 10);

    /** Observers:
     *  @param number - the number of the keyword in the order added.
     *  @return occurrences of this keyword within document
     *          (0 if number is out of range)
     */
    int TotalMatchCnt(int number);
    int ExactMatchCnt(int number);

    inline int QueryTerms() { return _mo->TermCount(); }
    const char* QueryTermText(int term_no);

    inline const key_occ_vector& OccurrenceList() { return _occ; }

    // This should ideally be const but no support for const iterators in our multiset:
    inline match_candidate_set& OrderedMatchSet() { return _matches; }

    inline const match_sequence* GetWorkSet() const { return _wrk_set; }

    /* @return Number of hits of any keywords within document */
    inline int TotalHits() { return _occ.size(); }

    /* @return true if this matcher has constraints (NEAR/WITHIN/PHRASE..)
     * applied to the selected match candidate set
     */
    inline bool HasConstraints() { return _mo->HasConstraints(); }
    /* @return true if this matcher uses the validity bits on keyword occurrences */
    inline bool UsesValid() { return _mo->UsesValid(); }

    long GlobalRank();

    // Current size of the document in progress..
    inline size_t DocumentSize() { return _endpos; }

    /** @return NULL if the document had no keyword hits, otherwise a new
     *  SummaryDesc (release with DeleteSummaryDesc)
     */
    SummaryDesc* CreateSummaryDesc(size_t length, size_t min_length,
                                   int max_matches,
                                   int surround_len);

    /** Get the log string for this matcher or the empty string if no log enabled */
    std::string GetLog();

    /** Returns the query used by the underlying match object */
    QueryExpr * getQuery() { return _mo->Query(); }

protected:
    /* Internal utilities
     * Those that may fail will return false upon failure.
     */
    bool add_occurrence(off_t pos, off_t tpos, size_t len);
    void reset_matches();
    void reset_occurrences();

    void update_match(MatchCandidate* m);
    void update_wrk_set(match_sequence& ws, MatchElement* k, QueryExpr* mexp);

    // factory methods for creating/referencing/dereferencing MatchCandidates:
    MatchCandidate* NewCandidate(QueryExpr*) __attribute__((noinline));
    MatchCandidate* RefCandidate(MatchCandidate* m);
    void DerefCandidate(MatchCandidate* m);
private:
    Result* _result;
    QueryHandle* _qhandle;
    MatchObject* _mo;
    match_iterator _match_iter;

    // char* _s;
    // the distance (in characters) between two tokens for them to be considered
    // within same match ("window size" during matching)
    size_t _winsize;
    // Window size used until max_matches has been found.
    size_t _winsizeFallback;
    // The max number of match candidates to manage in the work set for a non-leaf query node.
    size_t _max_match_candidates;

    // A constant to add to the proximity rank value in cases where there are no
    // constraints:
    size_t _proximity_noconstraint_offset;
    double _proximity_factor;

    // if set to >0 attempt to get as many complete matches before
    // winsize is put into effect
    int _need_complete_cnt;

    // Internal state
    size_t _endpos; // The last valid position from the token pipeline

    size_t _nontermcnt; // The number of nonterminals in the query
    // The sequence of occurrences of the search terms in the document
    key_occ_vector _occ;

    // the current working set of match candidates. This set is now an
    // array of subsets that are the working sets of each query non-terminal
    // Size of this array determined by number of non-terminals in the query
    // using QueryNode->node_idx as lookup.
    match_sequence* _wrk_set;

    // The set of completed match candidates in descending order
    match_candidate_set _matches;

    off_t _ctxt_start;
    unsigned long _log_mask; // _log_text: a built-up text object with log selectively
    std::string _log_text;   // enabled by _log_mask bits

    // Declared private to disable copying
    Matcher(Matcher &);
    Matcher &operator=(Matcher &);

    void flush_candidates();
    bool markup(const char* t, int len, off_t pos);

    void pushcontext(int ctxt);
    void popcontext(int ctxt);
};

/** Other utilities exposed for debug purposes
 */
bool wordchar(const unsigned char* s);
bool nonwordchar(const unsigned char* s);
/**
 * Decode the two hexadecimal digits at s[0] and s[1] into one character.
 *
 * Both lower- and upper-case digits are accepted. (The previous version
 * decoded 'A'..'F' as `c - '0'`, so e.g. "3C" produced 0x33 instead of 0x3C.)
 *
 * @param s  points at two characters; the caller is expected to have checked
 *           them with isxdigit(). A non-hex character contributes 0.
 * @return the decoded byte as a char
 */
inline char hexchar(const char* s)
{
    unsigned int value = 0;
    for (int i = 0; i < 2; i++) {
        const unsigned char ch = static_cast<unsigned char>(s[i]);
        unsigned int nibble = 0;
        if (ch >= '0' && ch <= '9') {
            nibble = ch - '0';
        } else if (ch >= 'a' && ch <= 'f') {
            nibble = ch - 'a' + 10;
        } else if (ch >= 'A' && ch <= 'F') {
            nibble = ch - 'A' + 10;
        }
        value = (value << 4) | nibble;
    }
    return static_cast<char>(value);
}
if (*c > 0) _separator.set(*c, 1); + } + for (const unsigned char* uc = connectors; *uc != '\0'; uc++) { + if (*uc > 0) _connector.set(*uc, 1); + } +} + +void SummaryConfig::init(std::string& cf, const char* str) +{ + bool escape = false; + for (;str && *str != '\0'; str++) { + if (!escape && *str == '\\') { + escape = true; + } else { + if (escape) { + // Allow space to be encoded as \_ (fsearchrc does not accept spaces..) + if (*str == '_') { + cf += ' '; + escape = false; + continue; + } else if (isxdigit(*str) && isxdigit(*(str+1))) { + cf += hexchar(str); + str++; + escape = false; + continue; + } else { + escape = false; + } + } + cf += *str; + } + } +} + + +ConfigFlag StringToConfigFlag(const char* confstring) +{ + if (strcmp(confstring, "off") == 0) + return CF_OFF; + if (strcmp(confstring, "on") == 0) + return CF_ON; + // default: + return CF_AUTO; +} diff --git a/juniper/src/vespa/juniper/SummaryConfig.h b/juniper/src/vespa/juniper/SummaryConfig.h new file mode 100644 index 00000000000..f5d19897c74 --- /dev/null +++ b/juniper/src/vespa/juniper/SummaryConfig.h @@ -0,0 +1,65 @@ +// Copyright 2016 Yahoo Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. +#pragma once +#include <string> + +enum ConfigFlag { + CF_OFF, + CF_ON, + CF_AUTO, + CF_MAXVAL +}; + + +/* Query highlight parameter class */ + +#ifndef _NEED_SUMMARY_CONFIG_IMPL +class SummaryConfig; +#else +#include <bitset> + +class SummaryConfig +{ +public: + SummaryConfig(const char* hi_on, const char* hi_off, + const char* usedots, const char* separators, + const unsigned char* connectors, + ConfigFlag esc_markup, + ConfigFlag preserve_white_space_); + + inline const std::string & highlight_on() const { return _highlight_on; } + inline const std::string & highlight_off() const { return _highlight_off; } + inline const std::string & dots() const { return _dots; } + inline bool separator(const char c) const { return (c < 0 ? 
false : _separator.test(c)); } + inline bool connector(const unsigned char c) const { return _connector.test(c); } + inline ConfigFlag escape_markup() const { return _escape_markup; } + inline ConfigFlag preserve_white_space() const { return _preserve_white_space; } + + +protected: + void init(std::string&, const char*); +private: + std::string _highlight_on; + std::string _highlight_off; + std::string _dots; + std::bitset<128> _separator; // Identify characters that should be removed in a teaser + std::bitset<256> _connector; // Identify characters that connects two tokens into one + ConfigFlag _escape_markup; + ConfigFlag _preserve_white_space; +}; + +#endif + + +ConfigFlag StringToConfigFlag(const char* confstring); + + +SummaryConfig* CreateSummaryConfig(const char* highlight_on, + const char* highlight_off, + const char* dots, + const char* separators, + const unsigned char* connectors, + const ConfigFlag escape_markup = CF_AUTO, + const ConfigFlag preserve_white_space = CF_OFF); + +void DeleteSummaryConfig(SummaryConfig*& sumconf); + diff --git a/juniper/src/vespa/juniper/appender.h b/juniper/src/vespa/juniper/appender.h new file mode 100644 index 00000000000..58bda1101e4 --- /dev/null +++ b/juniper/src/vespa/juniper/appender.h @@ -0,0 +1,141 @@ +// Copyright 2016 Yahoo Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. 
+#pragma once + +/* $Id: */ + +namespace juniper { + +class Appender +{ +private: + const SummaryConfig *_sumconf; + bool _escape_markup; + bool _preserve_white_space; + bool _last_was_space; + size_t _char_len; + + inline void append(std::vector<char> & s, char c) { + JD_INVAR(JD_INPUT, c != 0, return,\ + LOG(warning, "Document source contained 0-bytes")); + // eliminate separators: + if (_sumconf->separator(c)) { + return; + } + + // eliminate multiple space characters + if (!_preserve_white_space) { + if (c > 0 && isspace(c)) { + if (_last_was_space) { + return; + } else { + _last_was_space = true; + } + c = ' '; // Never output newline or tab + } else { + _last_was_space = false; + } + } + + bool handled_as_markup; + if (_escape_markup) { + handled_as_markup = true; + switch (c) { + case '<': + s.push_back('&'); + s.push_back('l'); + s.push_back('t'); + s.push_back(';'); + break; + case '>': + s.push_back('&'); + s.push_back('g'); + s.push_back('t'); + s.push_back(';'); + break; + case '"': + s.push_back('&'); + s.push_back('q'); + s.push_back('u'); + s.push_back('o'); + s.push_back('t'); + s.push_back(';'); + break; + case '&': + s.push_back('&'); + s.push_back('a'); + s.push_back('m'); + s.push_back('p'); + s.push_back(';'); + break; + case '\'': + s.push_back('&'); + s.push_back('#'); + s.push_back('3'); + s.push_back('9'); + s.push_back(';'); + break; + default: + handled_as_markup = false; + break; + } + if (handled_as_markup) { + _char_len++; + } + } else { + handled_as_markup = false; + } + + if (!handled_as_markup) { + s.push_back(c); + /** If at start of an UTF8 character (both highest bits or none of them set) + * another char is accumulated.. 
namespace juniper
{

/**
 * Compare the first n elements of two arrays of T.
 *
 * Unlike the C library function of the same name this does not stop at a
 * zero element; exactly n elements are compared unless a difference is
 * found first.
 *
 * @return 0 if all n elements are equal, otherwise the difference (as int)
 *         between the first pair of differing elements
 */
template <typename T>
int strncmp(const T* s1, const T* s2, size_t n)
{
    for (size_t idx = 0; idx != n; ++idx) {
        if (s1[idx] != s2[idx]) {
            return static_cast<int>(s1[idx]) - static_cast<int>(s2[idx]);
        }
    }
    return 0;
}

} // end namespace juniper
+/* $Id$ */ + +#include <vespa/fastos/fastos.h> +#include <vespa/log/log.h> +LOG_SETUP(".juniper.config"); +#include "config.h" +#include "IJuniperProperties.h" +#include "rpinterface.h" +#include "juniperdebug.h" +#define _NEED_SUMMARY_CONFIG_IMPL +#include "SummaryConfig.h" + +namespace juniper +{ + +Config::Config(const char* config_name, Juniper & juniper) : + _docsumparams(), + _matcherparams(), + _sumconf(NULL), + _config_name(config_name), + _juniper(juniper) +{ + std::string separators = ""; + separators += UNIT_SEPARATOR; + separators += GROUP_SEPARATOR; + + const char* high_on = GetProp("dynsum.highlight_on", "<b>"); + const char* high_off = GetProp("dynsum.highlight_off", "</b>"); + const char* contsym = GetProp("dynsum.continuation", "..."); + const char* fallback = GetProp("dynsum.fallback", "none"); + size_t summarylength = atoi(GetProp("dynsum.length", "256")); + size_t sum_minlength = atoi(GetProp("dynsum.min_length", "128")); + size_t stem_min = atoi(GetProp("stem.min_length", "5")); + size_t stem_extend = atoi(GetProp("stem.max_extend", "3")); + size_t surround_max = atoi(GetProp("dynsum.surround_max", "128")); + size_t max_matches = atoi(GetProp("dynsum.max_matches", "3")); + const char* escape_markup = GetProp("dynsum.escape_markup", "auto"); + const char* preserve_white_space = GetProp("dynsum.preserve_white_space", "off"); + size_t match_winsize = strtol(GetProp("matcher.winsize", "200"), NULL, 0); + size_t max_match_candidates = atoi(GetProp("matcher.max_match_candidates", "1000")); + const char* seps = GetProp("dynsum.separators", separators.c_str()); + const unsigned char* cons = + reinterpret_cast<const unsigned char*>(GetProp("dynsum.connectors", + separators.c_str())); + double proximity_factor = strtod(GetProp("proximity.factor", "0.25"), NULL); + // Silently convert to something sensible + if (proximity_factor > 1E8 || proximity_factor < 0) proximity_factor = 0.25; + + _sumconf = CreateSummaryConfig(high_on, high_off, contsym, seps, 
cons, + StringToConfigFlag(escape_markup), + StringToConfigFlag(preserve_white_space)); + _docsumparams.SetEnabled(true) + .SetLength(summarylength).SetMinLength(sum_minlength) + .SetMaxMatches(max_matches) + .SetSurroundMax(surround_max) + .SetFallback(fallback); + _matcherparams.SetWantGlobalRank(true) + .SetStemMinLength(stem_min).SetStemMaxExtend(stem_extend) + .SetMatchWindowSize(match_winsize) + .SetMaxMatchCandidates(max_match_candidates) + .SetWordFolder(& _juniper.getWordFolder()) + .SetProximityFactor(proximity_factor); +} + +Config::~Config() +{ + DeleteSummaryConfig(_sumconf); +} + +const char* Config::GetProp(const char* name, const char* def) +{ + std::string propstr = _config_name.c_str(); + propstr += '.'; + propstr.append(name); + if (_config_name == "juniper") { + return _juniper.getProp().GetProperty(propstr.c_str(), def); + } else { + const char* p = _juniper.getProp().GetProperty(propstr.c_str(), NULL); + if (p == NULL) { + propstr = "juniper."; + propstr.append(name); + p = _juniper.getProp().GetProperty(propstr.c_str(), def); + } + return p; + } +} + +} // end namespace juniper diff --git a/juniper/src/vespa/juniper/config.h b/juniper/src/vespa/juniper/config.h new file mode 100644 index 00000000000..f7586d187bb --- /dev/null +++ b/juniper/src/vespa/juniper/config.h @@ -0,0 +1,36 @@ +// Copyright 2016 Yahoo Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. 
#pragma once

#include "juniperparams.h"
#include "dpinterface.h"

class IJuniperProperties;

namespace juniper
{

class IReducer;
class IExpander;
class Juniper;

/** One named Juniper configuration: docsum parameters, matcher parameters
 *  and the summary (highlight) configuration.
 */
class Config
{
public:
    /** @param config_name prefix used by GetProp() when looking up properties
     *  @param juniper     the Juniper instance whose properties are read
     */
    Config(const char* config_name, Juniper & juniper);
    ~Config();

    /** Look up property "<config_name>.<name>", returning def if it is not set */
    const char* GetProp(const char* name, const char* def);

    DocsumParams  _docsumparams;
    MatcherParams _matcherparams;
    SummaryConfig * _sumconf;

private:
    std::string _config_name;
    Juniper & _juniper;

    // Declared private to disable copying
    Config(Config &);
    Config &operator=(Config &);
};

} // end namespace juniper
/** class Tokentype Hint as to which type of token this is.
 *  If this information is already aggregated by the caller
 *  it allows us to save some extra computation in Juniper.
 */
enum Tokentype
{
    TOKEN_UNKNOWN, // token type info not present.
    TOKEN_WORD,    // This is a word token
    TOKEN_SEP,     // This is a separator token
    TOKEN_MARKUP,  // This token contains general unspecified markup
    TOKEN_OTHER,   // This token is something else than any of the above
    TOKEN_MAX      // Max token types currently supported.
};

/** Opaque reference to the Juniper internal representation of a document summary
 *  Allows transport of Juniper information between different stages of the
 *  document processing without having to serialize/deserialize for each such step.
 */
class Docsum;

/** @class DocsumProcessor
 *  Interface for document processors specific to and implemented in Juniper
 *  that operate on doc summaries (at proper places in the document processing
 *  pipelines) to enhance and annotate the source for Juniper result
 *  processing (see rpinterface.h)
 */
class DocsumProcessor
{
public:
    virtual ~DocsumProcessor() {}

    /** Process a docsum with this processor. Processing can in the cases where
     *  token based processing is necessary just be implemented as setting
     *  the document summary to do processing for, but can also yield a complete
     *  processing.
     *  @param docsum_input a previously serialized Docsum object or an UTF-8 string
     *  @param length Length in bytes of the docsum_input object
     *  @return false if the operation failed, true otherwise
     */
    virtual bool Process(const char* docsum_input, size_t length) = 0;

    /** Process a docsum with this processor
     *  @param docsum an input Docsum to process. This DocsumProcessor
     *  also takes responsibility for releasing the Docsum object if necessary:
     *  if GetDocsum has not been called when this object is deleted,
     *  the Docsum gets released as well.
     *  @return false if the operation failed, true otherwise
     */
    virtual bool Process(Docsum* docsum) = 0;

    /** Low level document processing
     *  @param rep A textual representation of the token to process
     *  @param start The start position of this token within the original text
     *  @param len Length of the token representation
     *  @param type The token type in question (to allow saving of
     *         processing time in Juniper)
     *  @return true if operation ok, false if failure to process
     */
    virtual bool ProcessToken(const char* rep, off_t start, size_t len, Tokentype type) = 0;

    /** Retrieve a reference to the docsum representation
     *  @return The Docsum object including the current state of the docsum.
     *   This Docsum object must later be released by the caller using ReleaseDocsum
     *   or handed over to a subsequent processor.
     */
    virtual Docsum* GetDocsum() = 0;

    /** Create a textual representation of the annotated docsum suitable for disk storage
     *  for later usage by Juniper result processing.
     *  @param length The length of the serialized docsum
     *  @return A pointer to the text representation of the docsum. This object
     *  is valid throughout the life of this document processor or until
     *  the next call to Serialize() for this processor.
     */
    virtual const char* Serialize(size_t& length) = 0;
};

void ReleaseDocsum(Docsum* docsum);

} // end namespace juniper
+#include <vespa/fastos/fastos.h> +#include <vespa/log/log.h> +LOG_SETUP(".juniper.expcache"); +#include "expcache.h" +#include "matchobject.h" + +ExpansionCache::ExpansionCache(MatchObject* default_obj) + : _default(default_obj), _cache() +{} + + +ExpansionCache::~ExpansionCache() +{ + // Delete all associated maps + _cache.delete_second(); +} + + +MatchObject* ExpansionCache::Lookup(uint32_t langid) +{ + MatchObject* m = _cache.find(langid); + if (!m) + { + m = new MatchObject(_default->Query(), _default->HasReductions(), langid); + _cache.insert(langid, m); + } + return m; +} diff --git a/juniper/src/vespa/juniper/expcache.h b/juniper/src/vespa/juniper/expcache.h new file mode 100644 index 00000000000..cc3384b36ed --- /dev/null +++ b/juniper/src/vespa/juniper/expcache.h @@ -0,0 +1,23 @@ +// Copyright 2016 Yahoo Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. +#pragma once + +#include "simplemap.h" + +class MatchObject; + +class ExpansionCache +{ +public: + explicit ExpansionCache(MatchObject* default_obj); + + virtual ~ExpansionCache(); + + MatchObject* Lookup(uint32_t langid); +private: + MatchObject* _default; + simplemap<uint32_t, MatchObject*> _cache; + + ExpansionCache(ExpansionCache &); + ExpansionCache &operator=(ExpansionCache &); +}; + diff --git a/juniper/src/vespa/juniper/foreach_utils.h b/juniper/src/vespa/juniper/foreach_utils.h new file mode 100644 index 00000000000..05d015890d9 --- /dev/null +++ b/juniper/src/vespa/juniper/foreach_utils.h @@ -0,0 +1,44 @@ +// Copyright 2016 Yahoo Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. +#pragma once +#include <algorithm> + +/** \if utils + * A simple general deleter object to be passed to for instance std::for_each + * to delete pointer referenced objects + * in STL containers. 
+ * + */ + +struct Deleter +{ + template <typename T> + void operator()(T* t) const + { + delete t; + } +}; + +/* \def Handy macro to delete all pointer objects in a container + * (using \a Deleter) + */ + +#define delete_all(container) \ + std::for_each(container.begin(), container.end(), Deleter()) + + +#define FunctionObj(name, func) \ + struct name \ + { \ + template <typename T> \ + void operator()(T* t) \ + { \ + t->func(); \ + } \ + } + + +#define for_all(container, obj) \ + std::for_each(container.begin(), container.end(), obj()) + +/** \endif (utils) */ + diff --git a/juniper/src/vespa/juniper/hashbase.h b/juniper/src/vespa/juniper/hashbase.h new file mode 100644 index 00000000000..f0f29238993 --- /dev/null +++ b/juniper/src/vespa/juniper/hashbase.h @@ -0,0 +1,362 @@ +// Copyright 2016 Yahoo Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. +#pragma once + +#include <stdio.h> +#include <assert.h> + +// Simple default order that everybody has - pointer order: +template <typename T> +struct PtrComparator +{ + inline bool operator()(T m1, T m2) + { + return m1 < m2; + } +}; + +template <typename Key, typename T, int _tableSize, typename Comparator = PtrComparator<T> > +class Fast_HashTable; + +template<typename Key, typename T> +class Fast_HashTableElement +{ +private: + Fast_HashTableElement(const Fast_HashTableElement&); + Fast_HashTableElement& operator=(const Fast_HashTableElement&); + +protected: + + Key _key; + Fast_HashTableElement *_next; + T _item; + +public: + + Fast_HashTableElement(Key key, + Fast_HashTableElement<Key, T> *next, + T item) + : _key(key), _next(next), _item(item) {} + ~Fast_HashTableElement(){} + + inline Fast_HashTableElement<Key, T> *GetNext(void) { return _next; } + inline void SetNext(Fast_HashTableElement<Key, T> *next) { _next = next; } + inline Key GetKey(void) { return _key; } + inline T GetItem(void) { return _item; } +}; + + + +template <typename Key, typename T, int _tableSize> 
+class Fast_HashTableIterator +{ + friend class Fast_HashTable<Key, T, _tableSize>; + +private: + + const Fast_HashTable<Key, T, _tableSize> *_hashTable; + int _index; + Fast_HashTableElement<Key, T> *_runner; //current element in list + +protected: + + Fast_HashTableIterator(const Fast_HashTable<Key, T, _tableSize>& hashTable) : _hashTable(&hashTable), _index(-1) + { + _runner = SearchNext(); + }; + + + Fast_HashTableElement<Key, T> *SearchNext(void) + { + Fast_HashTableElement<Key, T> *retVal = NULL; + + for (++_index; _index<_hashTable->_tableSize; _index++) + { + retVal = _hashTable->_lookupTable[_index]; + + if (retVal != NULL) + break; + } + + return retVal; + } + + +public: + + inline T GetCurrent() { return _runner->GetItem(); }; + inline Key GetCurrentKey() { return _runner->GetKey(); } + + inline void Next() + { + if (_runner != NULL) + { + _runner = _runner->GetNext(); + + if (_runner == NULL) + { + _runner = SearchNext(); + } + } + }; + + inline bool End() const { return _runner == NULL; }; + // becomes true when ++ on the last element + + inline void Rewind(void) + { + _runner = NULL; + _index = -1; + + _runner = SearchNext(); + }; +}; + + +// Basis for specialization +template <int v> +struct Int2Type +{ + enum { value = v }; +}; + + +template <typename Key, typename T, int _tableSize = 0x10, typename Comparator> +class Fast_HashTable +{ +private: + Fast_HashTable(Fast_HashTable &); + Fast_HashTable &operator=(Fast_HashTable &); + +public: + typedef Fast_HashTableElement<Key, T> element; + typedef Fast_HashTableIterator<Key, T, _tableSize> iterator; + typedef Key keytype; + + friend class Fast_HashTableIterator<Key, T, _tableSize>; + +protected: + + int _numElements; + element **_lookupTable; + Comparator _compare; + + inline int HashFunction(Key key, Int2Type<true>) + { + return key & (_tableSize-1); + } + + inline int HashFunction(Key key, Int2Type<false>) + { + return key % _tableSize; + } + + inline int HashFunction(Key key) + { + return 
HashFunction(key, Int2Type<(_tableSize & (_tableSize-1) == _tableSize)>()); + } + +public: + Fast_HashTable() : _numElements(0), _lookupTable(NULL), _compare() + { + typedef element dummyDef; + _lookupTable = new dummyDef* [_tableSize]; + memset(_lookupTable, 0, _tableSize * sizeof(element *)); + } + + + Fast_HashTableIterator<Key, T, _tableSize> *NewIterator(void) + { + return new iterator(*this); + } + + inline int ElementCount(void) { return _numElements; } + + /** Remove and delete every element in the table. + * Only the table elements are deleted, not the items they hold. + * The element count is reset once all chains are freed, so that + * ElementCount() returns 0 after a Clear(). + */ + inline void Clear(void) + { + if (_numElements == 0) return; + for (int i=0; i<_tableSize; i++) + { + element *curr = _lookupTable[i]; + _lookupTable[i] = NULL; + + while (curr != NULL) + { + // Fetch the successor before the node is destroyed + element *next = curr->GetNext(); + delete curr; + curr = next; + } + } + _numElements = 0; + } + + + Key Insert(Key key, T item) + { + int pos = HashFunction(key, Int2Type<((_tableSize & (_tableSize-1)) == _tableSize)>()); + + if (_lookupTable[pos] == NULL || !_compare(item, _lookupTable[pos]->GetItem())) + { + _lookupTable[pos] = new element(key, _lookupTable[pos], item); + } + else + { + element* pel = _lookupTable[pos]; + element* el = pel->GetNext(); + while (el && _compare(item, el->GetItem())) + { + pel = el; + el = el->GetNext(); + } + pel->SetNext(new element(key, el, item)); + } + + _numElements++; + + return _lookupTable[pos]->GetKey(); + } + + + T Find(Key key) + { + T retVal; + retVal = NULL; + + int pos = HashFunction(key, Int2Type<(_tableSize & (_tableSize-1) == _tableSize)>()); + + for (element *curr=_lookupTable[pos]; curr != NULL; curr=curr->GetNext()) + { + if (curr->GetKey() == key) + { + retVal = curr->GetItem(); + break; + } + } + + return retVal; + } + + + element* FindRef(Key key) + { + int pos = HashFunction(key, Int2Type<((_tableSize & (_tableSize-1)) == _tableSize)>()); + + for (element *curr=_lookupTable[pos]; curr != NULL; curr=curr->GetNext()) + if (curr->GetKey() == key) return curr; + return 
NULL; + } + + + T Remove(Key key) + { + T retVal = NULL; + + int pos = HashFunction(key, Int2Type<(_tableSize & (_tableSize-1) == _tableSize)>()); + + element *curr=_lookupTable[pos]; + element *prev = NULL; + + for (; curr != NULL; curr=curr->GetNext()) + { + if (curr->GetKey() == key) + { + retVal = curr->GetItem(); + break; + } + + prev = curr; + } + + if (curr != NULL) + { + if (prev != NULL) + { + prev->SetNext(curr->GetNext()); + } + else + { + _lookupTable[pos] = curr->GetNext(); + } + + _numElements--; + + delete curr; + } + + return retVal; + } + + + + void RemoveItem(T item) + { + for (int i=0; i<_tableSize; i++) + { + element *curr = _lookupTable[i]; + element *prev = NULL; + + while(curr != NULL) + { + if (item == curr->GetItem()) + { + // Found item to delete + element *toBeDeleted = curr; + + curr = curr->GetNext(); + + if (prev != NULL) + { + prev->SetNext(curr); + } + else + { + _lookupTable[i] = curr; + } + + _numElements--; + + delete toBeDeleted; + } + else + { + prev = curr; + curr = curr->GetNext(); + } + } + } + } + + + + void Print(void) + { + for (int i=0; i<_tableSize; i++) + { + if (_lookupTable[i] != NULL) + { + printf("[%i]", i); + + for (element *curr=_lookupTable[i]; curr != NULL; curr=curr->GetNext()) + { + printf(" -> %u", curr->GetKey()); + } + + printf("\n"); + } + } + } + + + + virtual ~Fast_HashTable(void) + { + Clear(); + delete [] _lookupTable; + } + +}; + diff --git a/juniper/src/vespa/juniper/juniperdebug.h b/juniper/src/vespa/juniper/juniperdebug.h new file mode 100644 index 00000000000..7f60d988341 --- /dev/null +++ b/juniper/src/vespa/juniper/juniperdebug.h @@ -0,0 +1,56 @@ +// Copyright 2016 Yahoo Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. 
+#pragma once + +// Include something from STL so that _STLPORT_VERSION gets defined if appropriate +#include <string> + +/* Juniper debug macro */ + +#define JD_INFO 0x1 /* Useful information (verbose mode) */ +#define JD_PAR 0x2 /* Tracking parameter settings etc. */ +#define JD_DUMP 0x4 /* Dump statistics etc. */ +#define JD_JE 0x10 /* Juniper per query entry/exit */ +#define JD_ENT 0x100 /* Enter functions */ +#define JD_EXIT 0x200 /* Exit functions */ +#define JD_INPUT 0x400 /* Tracking input */ +#define JD_WCMP 0x1000 /* Word completion */ +#define JD_DESC 0x2000 /* Descriptor buildup */ +#define JD_SUMLEN 0x4000 /* Dynamic teaser length */ +#define JD_MDUMP 0x8000 /* Dumping found/qualified matches and match occurrences */ +#define JD_TOKEN 0x10000 /* Tokenization (verbose) */ +#define JD_ALLOC 0x20000 /* Allocations and deallocations */ +#define JD_PAR_V 0x40000 /* Parameter setting tracking (verbose) */ +#define JD_TOKBYT 0x100000 /* Use hexbyte token output (with JD_TOKEN) */ +#define JD_STACK 0x200000 /* Dump stack but do not attempt to process anything */ + +/* Logging to log object (juniperlog summary field) */ +#define JL(level, stmt) do { if (_log_mask & level) { stmt; } } while (0) + +#ifdef FASTOS_DEBUG +extern unsigned debug_level; +#define JD(level, stmt) do { if (debug_level & level) { stmt; } } while (0) +# warning "FASTOS_DEBUG is defined" + +/* Invariant checking */ + +#define JD_INVAR(level, condition, action, log) \ + do { if (!(condition)) { if (debug_level & level) { log; } action; } } while (0) +#else + +#define JD_INVAR(level, condition, action, log) \ + do { if (!(condition)) { action; } } while (0) +#define JD(level, stmt) + +#endif + + +#include "foreach_utils.h" + +FunctionObj(DoDump, dump); + +template <class _container> +void dump_list(_container& __c) +{ + for_all(__c, DoDump); +} + diff --git a/juniper/src/vespa/juniper/juniperparams.cpp b/juniper/src/vespa/juniper/juniperparams.cpp new file mode 100644 index 
00000000000..0973c8a20b6 --- /dev/null +++ b/juniper/src/vespa/juniper/juniperparams.cpp @@ -0,0 +1,168 @@ +// Copyright 2016 Yahoo Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. +#include <vespa/fastos/fastos.h> +#include <vespa/log/log.h> +LOG_SETUP(".juniper.juniperparams"); +#include "juniperdebug.h" +#include "juniperparams.h" +#include "Matcher.h" + +// DocsumParams implementation: +// --------------------------------------------------------------- + +DocsumParams::DocsumParams() : + _enabled(false), _length(256), _min_length(128), _max_matches(3), + _surround_max(80), _space_chars(""), _fallback(FALLBACK_NONE) +{ } + +DocsumParams& DocsumParams::SetEnabled(bool en) +{ + _enabled = en; + return *this; +} + +DocsumParams& DocsumParams::SetLength(size_t length) +{ + _length = length; + return *this; +} + +DocsumParams& DocsumParams::SetMinLength(size_t length) +{ + _min_length = length; + return *this; +} + +DocsumParams& DocsumParams::SetMaxMatches(size_t matches) +{ + _max_matches = matches; + return *this; +} + +DocsumParams& DocsumParams::SetSurroundMax(size_t length) +{ + _surround_max = length; + return *this; +} + +DocsumParams& DocsumParams::SetSpaceChars(const char* spacechars) +{ + _space_chars = spacechars; + return *this; +} + +DocsumParams& DocsumParams::SetFallback(const char* fallback) +{ + if (strcmp("prefix", fallback) == 0) { + _fallback = FALLBACK_PREFIX; + } else { + _fallback = FALLBACK_NONE; + } + return *this; +} + +size_t DocsumParams::Length() const { return _length; } +size_t DocsumParams::MinLength() const { return _min_length; } +size_t DocsumParams::MaxMatches() const { return _max_matches; } +size_t DocsumParams::SurroundMax() const { return _surround_max; } +bool DocsumParams::Enabled() const { return _enabled; } +const char* DocsumParams::SpaceChars() const { return _space_chars.c_str(); } +int DocsumParams::Fallback() const { return _fallback; } + +// MatcherParams implementation: +// 
--------------------------------------------------------------- + + +MatcherParams::MatcherParams() : + _prefix_extend_length(3), + _prefix_min_length(5), + _match_winsize(200), + _match_winsize_fallback_multiplier(10.0), + _max_match_candidates(1000), + _want_global_rank(false), + _stem_min(0), _stem_extend(0), + _wordfolder(NULL), _proximity_factor(1.0) +{ } + + +MatcherParams& MatcherParams::SetPrefixExtendLength(size_t extend_length) +{ + _prefix_extend_length = extend_length; + return *this; +} + +MatcherParams& MatcherParams::SetPrefixMinLength(size_t min_length) +{ + _prefix_min_length = min_length; + return *this; +} + + +MatcherParams& MatcherParams::SetMatchWindowSize(size_t winsize) +{ + _match_winsize = winsize; + return *this; +} + +MatcherParams& MatcherParams::SetMatchWindowSizeFallbackMultiplier(double winsize) +{ + _match_winsize_fallback_multiplier = winsize; + return *this; +} + +MatcherParams& MatcherParams::SetMaxMatchCandidates(size_t max_match_candidates) +{ + _max_match_candidates = max_match_candidates; + return *this; +} + +MatcherParams& MatcherParams::SetWantGlobalRank(bool global_rank) +{ + _want_global_rank = global_rank; + return *this; +} + +MatcherParams& MatcherParams::SetStemMinLength(size_t stem_min) +{ + _stem_min = stem_min; + return *this; +} + + +MatcherParams& MatcherParams::SetStemMaxExtend(size_t stem_extend) +{ + _stem_extend = stem_extend; + return *this; +} + +size_t MatcherParams::PrefixExtendLength() const { return _prefix_extend_length; } +size_t MatcherParams::PrefixMinLength() const { return _prefix_min_length; } +size_t MatcherParams::MatchWindowSize() const { return _match_winsize; } +double MatcherParams::MatchWindowSizeFallbackMultiplier() const { return _match_winsize_fallback_multiplier; } +size_t MatcherParams::MaxMatchCandidates() const { return _max_match_candidates; } +bool MatcherParams::WantGlobalRank() const { return _want_global_rank; } +size_t MatcherParams::StemMinLength() const { return _stem_min; 
} +size_t MatcherParams::StemMaxExtend() const { return _stem_extend; } + + +MatcherParams& MatcherParams::SetWordFolder(Fast_WordFolder* wordfolder) +{ + _wordfolder = wordfolder; + return *this; +} + +Fast_WordFolder* MatcherParams::WordFolder() { return _wordfolder; } + + +MatcherParams& MatcherParams::SetProximityFactor(double factor) +{ + _proximity_factor = factor; + return *this; +} + +double MatcherParams::ProximityFactor() { return _proximity_factor; } + + +/** Equality for MatcherParams, compared field by field through the accessors. + * A raw memcmp over the objects would also compare any padding bytes + * (typically present after the bool member), whose contents are + * indeterminate, so objects with identical settings could compare unequal. + * @param mp1 first parameter set + * @param mp2 second parameter set + * @return true if every parameter value of mp1 equals that of mp2 + */ +bool operator==(MatcherParams& mp1, MatcherParams& mp2) +{ + return mp1.PrefixExtendLength() == mp2.PrefixExtendLength() + && mp1.PrefixMinLength() == mp2.PrefixMinLength() + && mp1.MatchWindowSize() == mp2.MatchWindowSize() + && mp1.MatchWindowSizeFallbackMultiplier() == mp2.MatchWindowSizeFallbackMultiplier() + && mp1.MaxMatchCandidates() == mp2.MaxMatchCandidates() + && mp1.WantGlobalRank() == mp2.WantGlobalRank() + && mp1.StemMinLength() == mp2.StemMinLength() + && mp1.StemMaxExtend() == mp2.StemMaxExtend() + && mp1.WordFolder() == mp2.WordFolder() + && mp1.ProximityFactor() == mp2.ProximityFactor(); +} diff --git a/juniper/src/vespa/juniper/juniperparams.h b/juniper/src/vespa/juniper/juniperparams.h new file mode 100644 index 00000000000..c5b97ec6d46 --- /dev/null +++ b/juniper/src/vespa/juniper/juniperparams.h @@ -0,0 +1,105 @@ +// Copyright 2016 Yahoo Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. +#pragma once + +#include <string> +#include <vespa/fastos/fastos.h> +#include <vespa/fastlib/text/wordfolder.h> + +class SummaryConfig; + +class DocsumParams +{ +public: + enum { + FALLBACK_NONE, + FALLBACK_PREFIX + }; + + DocsumParams(); + + DocsumParams& SetEnabled(bool en); + bool Enabled() const; + + DocsumParams& SetLength(size_t length); + size_t Length() const; + + DocsumParams& SetMinLength(size_t length); + size_t MinLength() const; + + DocsumParams& SetMaxMatches(size_t matches); + size_t MaxMatches() const; + + DocsumParams& SetSurroundMax(size_t length); + size_t SurroundMax() const; + + DocsumParams& SetSpaceChars(const char* spacechars); + const char* SpaceChars() const; + + DocsumParams& SetFallback(const char* fallback); + int Fallback() const; + +private: + bool _enabled; + size_t _length; + size_t _min_length; + size_t _max_matches; + size_t _surround_max; + std::string _space_chars; + int _fallback; +}; + + +class MatcherParams +{ +public: + MatcherParams(); + + MatcherParams& SetPrefixExtendLength(size_t extend_length); + 
size_t PrefixExtendLength() const; + + MatcherParams& SetPrefixMinLength(size_t min_length); + size_t PrefixMinLength() const; + + MatcherParams& SetMatchWindowSize(size_t winsize); + size_t MatchWindowSize() const; + + MatcherParams& SetMatchWindowSizeFallbackMultiplier(double winsize); + double MatchWindowSizeFallbackMultiplier() const; + + MatcherParams& SetMaxMatchCandidates(size_t max_match_candidates); + size_t MaxMatchCandidates() const; + + MatcherParams& SetWantGlobalRank(bool global_rank); + bool WantGlobalRank() const; + + MatcherParams& SetStemMinLength(size_t stem_min); + size_t StemMinLength() const; + + MatcherParams& SetStemMaxExtend(size_t stem_extend); + size_t StemMaxExtend() const; + + MatcherParams& SetWordFolder(Fast_WordFolder* wordfolder); + Fast_WordFolder* WordFolder(); + + MatcherParams& SetProximityFactor(double factor); + double ProximityFactor(); + +private: + size_t _prefix_extend_length; + size_t _prefix_min_length; + size_t _match_winsize; + double _match_winsize_fallback_multiplier; + size_t _max_match_candidates; + bool _want_global_rank; + size_t _stem_min; + size_t _stem_extend; + Fast_WordFolder* _wordfolder; // The wordfolder object needed as 1st parameter to folderfun + double _proximity_factor; + + MatcherParams(MatcherParams &); + MatcherParams &operator=(MatcherParams &); +}; + + +bool operator==(MatcherParams& mp1, MatcherParams& mp2); + diff --git a/juniper/src/vespa/juniper/keyocc.cpp b/juniper/src/vespa/juniper/keyocc.cpp new file mode 100644 index 00000000000..f37816a231c --- /dev/null +++ b/juniper/src/vespa/juniper/keyocc.cpp @@ -0,0 +1,29 @@ +// Copyright 2016 Yahoo Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. 
+#include <vespa/fastos/fastos.h> +#include <vespa/log/log.h> +LOG_SETUP(".juniper.keyocc"); +#include "keyocc.h" + +key_occ::key_occ(const char* term_, off_t spos, off_t stoken, int len) : + MatchElement(spos, stoken), + tokenlen(len), + _term(term_) +{ } + + +void key_occ::set_valid() +{ + _valid = true; +} + +void key_occ::add_to_keylist(keylist& kl) +{ + key_occ* k = this; + kl.insert(k); +} + + +void key_occ::dump(std::string& s) +{ + s.append(term()); +} diff --git a/juniper/src/vespa/juniper/keyocc.h b/juniper/src/vespa/juniper/keyocc.h new file mode 100644 index 00000000000..476a2c1a1be --- /dev/null +++ b/juniper/src/vespa/juniper/keyocc.h @@ -0,0 +1,37 @@ +// Copyright 2016 Yahoo Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. +#pragma once + +#include <vespa/fastos/fastos.h> +#include "multiset.h" +#include <vector> +#include "matchelem.h" +#include "querynode.h" + +typedef key_occ* key_occ_ptr; +typedef std::vector<key_occ_ptr> key_occ_vector; + + +class key_occ : public MatchElement +{ +public: + virtual void set_valid(); + virtual void add_to_keylist(keylist& kl); + virtual void dump(std::string& s); + virtual size_t length() const { return tokenlen; } + inline const char* term() { return _term; } + inline size_t word_length() const { return 1; } + inline bool complete() { return true; } + virtual inline off_t endpos() const { return _startpos + tokenlen; } + virtual inline off_t endtoken() const { return _starttoken + 1; } + + int tokenlen; + key_occ(const char* term, off_t posi, off_t tpos, int len); + +private: + const char* _term; // Pointer into first match (for debugging purposes only) + + key_occ(key_occ &); + key_occ &operator=(key_occ &); +}; + + diff --git a/juniper/src/vespa/juniper/matchelem.cpp b/juniper/src/vespa/juniper/matchelem.cpp new file mode 100644 index 00000000000..2646cb94b0f --- /dev/null +++ b/juniper/src/vespa/juniper/matchelem.cpp @@ -0,0 +1,13 @@ +// Copyright 2016 Yahoo Inc. 
Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. +/* $Id$ */ + +#include <vespa/fastos/fastos.h> +#include <vespa/log/log.h> +LOG_SETUP(".juniper.matchelem"); +#include "matchelem.h" + +MatchElement::MatchElement(off_t spos, off_t stoken) : + _starttoken(stoken), + _startpos(spos), + _valid(false) +{ } diff --git a/juniper/src/vespa/juniper/matchelem.h b/juniper/src/vespa/juniper/matchelem.h new file mode 100644 index 00000000000..91cbb5e7d65 --- /dev/null +++ b/juniper/src/vespa/juniper/matchelem.h @@ -0,0 +1,54 @@ +// Copyright 2016 Yahoo Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. +/* $Id: */ + +#pragma once + +#include <string> +#include "multiset.h" +#include "querynode.h" + +class Matcher; +class key_occ; +class MatchCandidate; + +/* Sequential ordering of elements */ +template <typename _Elem> +struct sequential_elem +{ + inline bool operator()(_Elem m1, _Elem m2) + { + return m1->starttoken() < m2->starttoken(); + } +}; +typedef JUNIPER_SET<key_occ*, sequential_elem<key_occ*> > keylist; + +class MatchElement +{ +public: + MatchElement(off_t startpos, off_t starttoken); + virtual ~MatchElement() {} + virtual void set_valid() = 0; // Mark this element and its subelements as valid + virtual void add_to_keylist(keylist& kl) = 0; + virtual void dump(std::string& s) = 0; + virtual size_t length() const = 0; + virtual size_t word_length() const = 0; + virtual bool complete() = 0; + virtual off_t endpos() const = 0; + virtual off_t endtoken() const = 0; + + // Word/token position of the first token in this match element + inline off_t starttoken() const { return _starttoken; } + + // byte position of the start of the first token in this match element + inline off_t startpos() const { return _startpos; } + + // Set if this match element is part of a valid match + inline bool valid() const { return _valid; } + + virtual MatchCandidate* Complex() { return NULL; } +protected: + off_t 
_starttoken; // The token number at which this element starts + off_t _startpos; // The byte number (byte pos) at which this element starts + bool _valid; // tag set if this match element is part of a valid match +}; + diff --git a/juniper/src/vespa/juniper/matchobject.cpp b/juniper/src/vespa/juniper/matchobject.cpp new file mode 100644 index 00000000000..1c66a485ec0 --- /dev/null +++ b/juniper/src/vespa/juniper/matchobject.cpp @@ -0,0 +1,425 @@ +// Copyright 2016 Yahoo Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. +/* $Id$ */ + +#include <vespa/fastos/fastos.h> +#include <vespa/log/log.h> +LOG_SETUP(".juniper.matchobject"); +#include "query.h" +#include "matchobject.h" +#include "juniperdebug.h" +#include "result.h" +#include "querynode.h" +#include "charutil.h" +#include <vespa/fastlib/util/wildcard_match.h> +#include "querymodifier.h" +#include "queryhandle.h" +#include <stack> + +class traverser : public IQueryExprVisitor +{ +public: + traverser(MatchObject& mo) : _mo(mo) {} + + virtual void VisitQueryNode(QueryNode*) + { + // We must not add this node to nonterminals before all children has been added! + // Matcher::flush_candidates() depend on this order to avoid having to loop + // until no more candidates... 
+ } + + virtual void RevisitQueryNode(QueryNode* n) + { + _mo.add_nonterm(n); + } + + virtual void VisitQueryTerm(QueryTerm* t) + { + if (t->rewriter && t->rewriter->ForDocument()) + _mo.add_reduction_term(t, t->rewriter); + else + _mo.add_queryterm(t); + } +private: + MatchObject& _mo; +}; + + +class query_expander : public IQueryExprVisitor +{ +public: + query_expander(MatchObject& mo, uint32_t langid) + : _caller(), _mo(mo), _langid(langid) {} + + virtual void VisitQueryTerm(QueryTerm* orig) + { + const char* nt = NULL; + size_t length; + juniper::RewriteHandle* te = NULL; + bool reduction = false; + + if (orig->rewriter) + { + // Check if expansions are necessary + if (orig->rewriter->ForQuery()) + { + te = orig->rewriter->Rewrite(_langid, orig->term()); + if (te) + nt = orig->rewriter->NextTerm(te, length); + } + + // If this rewriter is both an expander and a reducer, only matches + // of reduced forms will be valid, need to take steps to add expansions + // to a separate mapping + reduction = orig->rewriter->ForDocument(); + } + if (nt == NULL) + { + QueryTerm* t = new QueryTerm(orig); // No matches found, just clone term.. + if (!reduction) + _mo.add_queryterm(t); + else + _mo.add_reduction_term(t, orig->rewriter); + update(t); + return; + } + // Start expanding... + std::vector<QueryTerm*> newterms; + while (nt != NULL) + { + QueryTerm* nqt = new QueryTerm(nt, length, -1); + // Copy options but do not apply juniper stem match for expanded terms + nqt->_options = orig->_options | X_EXACT; + if (!reduction) + _mo.add_queryterm(nqt); + else + _mo.add_reduction_term(nqt, orig->rewriter); + newterms.push_back(nqt); + nt = orig->rewriter->NextTerm(te, length); + } + if (newterms.size() == 1) + { + update(newterms.front()); + return; + } + + QueryNode* qn = new QueryNode(newterms.size(), orig->_weight, orig->_weight); + // preserve options for nodes too, but make the node an OR.. 
+ qn->_options = orig->_options | X_OR; + for (std::vector<QueryTerm*>::iterator it = newterms.begin(); + it != newterms.end(); ++it) + { + qn->AddChild(*it); + } + update(qn); + _mo.add_nonterm(qn); + } + + + // Visit on descent: + void VisitQueryNode(QueryNode* n) + { + QueryNode* qn = new QueryNode(n); + update(qn); + _caller.push(qn); + } + + + // revisit on return: + void RevisitQueryNode(QueryNode* n) + { + QueryNode* qn = _caller.top(); + if (n->_parent) _caller.pop(); + _mo.add_nonterm(qn); + } + + QueryExpr* NewQuery() + { + if (_caller.empty()) return NULL; + return _caller.top(); + } +private: + void update(QueryExpr* e) + { + if (!_caller.empty()) + _caller.top()->AddChild(e); + } + + std::stack<QueryNode*> _caller; // Recursion emulator.. + MatchObject& _mo; + uint32_t _langid; +}; // class query_expander + + + + +MatchObject::MatchObject(QueryExpr* query, bool has_reductions) : + _query(query), + _qt(), + _nonterms(), + _match_overlap(false), _max_arity(0), + _has_reductions(has_reductions), + _qt_byname(), + _reduce_matchers() +{ + LOG(debug, "MatchObject(default)"); + traverser tr(*this); + query->Accept(tr); // Initialize structure for the query + _max_arity = query->MaxArity(); +} + + + +MatchObject::MatchObject(QueryExpr* query, bool has_reductions, uint32_t langid) : + _query(NULL), + _qt(), + _nonterms(), + _match_overlap(false), + _max_arity(0), + _has_reductions(has_reductions), + _qt_byname(), + _reduce_matchers() +{ + LOG(debug, "MatchObject(language %d)", langid); + query_expander qe(*this, langid); + query->Accept(qe); // Create a new, modified query + _query = qe.NewQuery(); // Fetch the new query.. + + if (LOG_WOULD_LOG(debug)) { + std::string s; + _query->Dump(s); + LOG(debug, "juniper::MatchObject(language id %d): modified stack: %s", + langid, s.c_str()); + } + _max_arity = _query->MaxArity(); +} + + + +MatchObject::~MatchObject() +{ + // _query is now always owned by the match object! 
+ delete _query; +} + + +bool MatchObject::Match(MatchObject::iterator& mi, Token& token, unsigned& options) +{ + QueryTerm* q = mi.first_match(token); + if (!q) return false; + options = 0; + q->total_match_cnt++; + if (q->ucs4_len == static_cast<size_t>(token.curlen)) + { + options |= X_EXACT; + q->exact_match_cnt++; + } + return true; +} + + +void MatchObject::add_nonterm(QueryNode* n) +{ + _nonterms.push_back(n); + n->_node_idx = _nonterms.size() - 1; +} + + + +void MatchObject::add_queryterm(QueryTerm* nt) +{ + _qt.push_back(nt); + nt->idx = _qt.size() - 1; + + _qt_byname.Insert( + *(reinterpret_cast<const queryterm_hashtable::keytype*>(nt->ucs4_term())), nt); + + LOG(debug, "MatchObject: adding term '%s'", nt->term()); +} + + +void MatchObject::add_reduction_term(QueryTerm* nt, juniper::Rewriter* rw) +{ + // All terms go here: + _qt.push_back(nt); + nt->idx = _qt.size() - 1; + + LOG(debug, "MatchObject: adding reduction term '%s'", nt->term()); + if (!nt->reduce_matcher) + nt->reduce_matcher = _reduce_matchers.find(rw); + nt->reduce_matcher->add_term(nt); +} + + +match_iterator::match_iterator(MatchObject* mo, Result* rhandle) : + _table(mo->_qt_byname), _el(NULL), _rhandle(rhandle), + _reductions(mo->HasReductions()), _reduce_matches(NULL), _reduce_matches_it(), + _mo(mo), _len(0), _stem_min(rhandle->StemMin()), _stemext(rhandle->StemExt()), + _term(NULL) +{} + + +QueryTerm* match_iterator::first() +{ + for (; _el != NULL; _el = _el->GetNext()) + { + QueryTerm* q = _el->GetItem(); + + // If exact match is desired by this subexpression, + // only have effect if exact match + if (q->Exact() && _len > q->len) continue; + + if (q->is_wildcard()) + { + if (fast::util::wildcard_match(_term, q->ucs4_term()) == false) continue; + return q; + } + + if (_len < q->ucs4_len) continue; + // allow prefix match iff prefix query term or + // rest < _stem_extend and length > stem_min + if (!q->is_prefix()) + { + size_t stem_extend = (q->ucs4_len <= _stem_min ? 
0 : _stemext); + if (_len > q->ucs4_len + stem_extend) continue; + } + if (juniper::strncmp(_term, q->ucs4_term(), q->ucs4_len) != 0) continue; + return q; + } + return NULL; +} + + +QueryTerm* match_iterator::next_reduce_match() +{ + if (!_reduce_matches) return NULL; + if (_reduce_matches_it != _reduce_matches->end()) + { + QueryTerm* t = *_reduce_matches_it; + ++_reduce_matches_it; + return t; + } + delete _reduce_matches; + _reduce_matches = NULL; + return NULL; +} + + + +QueryTerm* match_iterator::first_match(Token& token) +{ + const ucs4_t* term = token.token; + size_t len = token.curlen; + + // Check for interlinear annotation, and "lie" to the matchobject + if (*term == 0xFFF9) { + // 0xFFF9 = Interlinear Annotation ANCHOR + // 0xFFFA = Interlinear Annotation SEPARATOR + // 0xFFFB = Interlinear Annotation TERMINATOR + const ucs4_t *terminator = term + len; + token.token = ++term; + // starting annotation, skip to after SEPARATOR + while (term < terminator && *term != 0xFFFA) { + term++; + } + const ucs4_t *separator = term; + // found separator, assume terminator at end + if (term + 2 < terminator) { + token.token = ++term; // skip the SEPARATOR + QueryTerm *qt; + // process until TERMINATOR is found + while (term < terminator && *term != 0xFFFB) { + // Handle multiple terms in the same annotation, for compound nouns or multiple stems + if (*term == ' ' || *term == 0xFFFA) { + token.curlen = term - token.token; + LOG(debug, "recurse A to match token %u..%u len %d", token.token[0], token.token[token.curlen-1], token.curlen); + qt = this->first_match(token); + if (qt != NULL) { + return qt; + } + token.token = ++term; // skip SPACE + } else { + ++term; + } + } + token.curlen = term - token.token; + LOG(debug, "recurse B to match token %u..%u len %d", token.token[0], token.token[token.curlen-1], token.curlen); + return this->first_match(token); + } else { + // broken annotation + // process first part (before SEPARATOR) instead + token.curlen = separator - 
token.token; + LOG(debug, "recurse C to match token %u..%u len %d", token.token[0], token.token[token.curlen-1], token.curlen); + return this->first_match(token); + } + } else { + // plain token, so just reference the term + _term = token.token; + } + + queryterm_hashtable::keytype termval = *(reinterpret_cast<const queryterm_hashtable::keytype*>(term)); + queryterm_hashtable::keytype keyval = termval; + if (LOG_WOULD_LOG(spam)) { + char utf8term[1024]; + Fast_UnicodeUtil::utf8ncopy(utf8term, term, 1024, (term != NULL ? len : 0)); + LOG(spam, "term %s, len %ld, keyval 0x%x termval 0x%x", + utf8term, len, keyval, termval); + } + _el = _table.FindRef(keyval); + _len = len; + QueryTerm* rtrn = first(); + + if (rtrn == 0) + { + _el = _table.FindRef('*'); + if ((rtrn = first()) == 0) + { + _el = _table.FindRef('?'); + rtrn = first(); + } + } + if (_reductions) + { + _reduce_matches = _mo->_reduce_matchers.match(_rhandle->_langid, + &_rhandle->_docsum[token.bytepos], + token.bytelen); + if (_reduce_matches) + { + _reduce_matches_it = _reduce_matches->begin(); + + // Find the first reduce match only if no other match was found + if (!rtrn) + rtrn = current(); + } + } + return rtrn; +} + + + +/** Return the current element without advancing iterator pointers. + * A pending hash table element takes precedence over the reduce matches. + * When the reduce match list is exhausted it is released and the pointer is + * cleared, so that later current()/next() calls cannot touch or re-delete it. + * @return the current query term, or NULL if there is none + */ +QueryTerm* match_iterator::current() +{ + if (_el) return _el->GetItem(); + if (!_reduce_matches) return NULL; + if (_reduce_matches_it != _reduce_matches->end()) + { + QueryTerm* t = *_reduce_matches_it; + return t; + } + delete _reduce_matches; + _reduce_matches = NULL; // as in next_reduce_match(): never leave a dangling pointer behind + return NULL; +} + + +QueryTerm* match_iterator::next() +{ + if (_el) + { + _el = _el->GetNext(); + return first(); + } + else if (_reduce_matches) + return next_reduce_match(); + return NULL; +} + diff --git a/juniper/src/vespa/juniper/matchobject.h b/juniper/src/vespa/juniper/matchobject.h new file mode 100644 index 00000000000..a97cb7cb6ab --- /dev/null +++ b/juniper/src/vespa/juniper/matchobject.h @@ -0,0 +1,116 @@ +// Copyright 2016 Yahoo Inc.
Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. +#pragma once + +#include "queryhandle.h" +#include "querynode.h" +#include "hashbase.h" +#include <vespa/fastlib/text/unicodeutil.h> +#include "reducematcher.h" +#include "ITokenProcessor.h" + +typedef juniper::Result Result; +typedef ITokenProcessor::Token Token; + +// Reverse length order, longest match first - needed to allow matcher to +// match on the most explicit matches before the more implicit ones +// Quick hack for setting up matchobject (which depend on (<=) +// +struct QueryTermLengthComparator +{ + inline bool operator()(QueryTerm* m1, QueryTerm* m2) + { + return m1->len <= m2->len; + } +}; + +typedef Fast_HashTable<ucs4_t, QueryTerm*, 0x20, + QueryTermLengthComparator> queryterm_hashtable; + +class match_iterator +{ +public: + match_iterator(MatchObject* mo, Result* rhandle); + QueryTerm* current(); + QueryTerm* next(); + QueryTerm* first_match(Token& token); +private: + QueryTerm* first(); + QueryTerm* next_reduce_match(); + queryterm_hashtable& _table; + queryterm_hashtable::element* _el; +public: + Result* _rhandle; +private: + bool _reductions; + const std::vector<QueryTerm*>* _reduce_matches; + std::vector<QueryTerm*>::const_iterator _reduce_matches_it; + MatchObject* _mo; + size_t _len, _stem_min, _stemext; + const ucs4_t* _term; + + match_iterator(match_iterator &); + match_iterator &operator=(match_iterator &); +}; + + +// MatchObject encapsulate the data structure necessary to map from a query word to a +// unique index + options for this query. +// A MatchObject keeps no state for a particular document +// so it can be reused for later results for +// the same query/language combination. + +class MatchObject +{ +public: + // Constructor for the default match object. 
+ // Assumes ownership of query + MatchObject(QueryExpr* query, bool has_reductions); + + // Constructor for language specific extensions: + // Creates a duplicate of query + MatchObject(QueryExpr* query, bool has_reductions, uint32_t langid); + + ~MatchObject(); + + typedef match_iterator iterator; + + /** Check if the given token matches any query term in the MatchObject + * @param mi an iterator that will be updated to iterate over all matching query terms + * @param token the token to match (its curlen is compared against the term length) + * @param options set only when a match is found: 0, or X_EXACT if the lengths are equal + * @return true if a match was found (and the iterator points to the first element) + */ + bool Match(iterator& mi, Token& token, unsigned& options); + + inline QueryTerm* Term(int idx) { return _qt[idx]; } + + inline size_t TermCount() { return _qt.size(); } + inline size_t NontermCount() { return _nonterms.size(); } + inline int MaxArity() { return _max_arity; } + + inline bool HasConstraints() { return (_query ? (_query->_options & X_CONSTR) : false); } + inline bool UsesValid() { return (_query ? (_query->_options & X_CHKVAL) : false); } + + inline QueryExpr* Query() { return _query; } + inline bool HasReductions() { return _has_reductions; } + + // internal use only..
+ void add_queryterm(QueryTerm* term); + void add_nonterm(QueryNode* n); + void add_reduction_term(QueryTerm* term, juniper::Rewriter*); +private: + friend class match_iterator; + QueryExpr* _query; + std::vector<QueryTerm*> _qt; // fast lookup by index + std::vector<QueryNode*> _nonterms; + bool _match_overlap; + int _max_arity; + bool _has_reductions; // query contains terms that reqs reduction of tokens before matching + queryterm_hashtable _qt_byname; // fast lookup by name + juniper::ReduceMatcher _reduce_matchers; + + MatchObject(MatchObject &); + MatchObject &operator=(MatchObject &); +}; + diff --git a/juniper/src/vespa/juniper/mcand.cpp b/juniper/src/vespa/juniper/mcand.cpp new file mode 100644 index 00000000000..b8c2434c7a5 --- /dev/null +++ b/juniper/src/vespa/juniper/mcand.cpp @@ -0,0 +1,219 @@ +// Copyright 2016 Yahoo Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. +/* $Id$ */ + +#include <vespa/fastos/fastos.h> +#include <vespa/log/log.h> +LOG_SETUP(".juniper.mcand"); +#include "mcand.h" +#include "Matcher.h" +#include "juniperdebug.h" +#include <vespa/vespalib/util/stringfmt.h> + +// Invariant: elms has room for query->_arity match element pointers +MatchCandidate::MatchCandidate(QueryExpr* m, MatchElement** elms, off_t ctxt_start) : + MatchElement(0, 0), + element(elms), + _match(m), + _nelems(0), + _elems(std::max(m->_arity, 1)), + _endpos(0), + _endtoken(0), + _docid(0), + _ctxt_start(ctxt_start), + _elem_weight(0), + _options(m->_options), + _overlap(0), + _refcnt(1), + _klist() +{ + for (int i = 0; i < _elems; i++) + element[i] = NULL; + + if (LOG_WOULD_LOG(debug)) { + std::string s; + dump(s); + LOG(debug, "new %s", s.c_str()); + } +} + + +MatchCandidate::~MatchCandidate() +{ + delete[] element; +} + +void MatchCandidate::dump(std::string& s) +{ + int i; + s.append("MC<"); + for (i = 0; i < _elems; i++) + { + if (i > 0) s.append(";"); + _match->AsNode()->_children[i]->Dump(s); + s.append(":"); + if 
(element[i]) + { + s.append(vespalib::make_string("%" PRId64, + static_cast<int64_t> + (element[i]->starttoken()))); + if (element[i]->starttoken() + 1 < element[i]->endtoken()) + s.append(vespalib::make_string("-%" PRId64, + static_cast<int64_t> + (element[i]->endtoken()))); + } + else + s.append("<nil>"); + } + s.append(">"); +} + + +void MatchCandidate::set_valid() +{ + for (int j = 0; j < _elems; j++) + if (element[j]) + element[j]->set_valid(); + _valid = true; +} + + +void MatchCandidate::make_keylist() +{ + add_to_keylist(_klist); +} + + +void MatchCandidate::add_to_keylist(keylist& kl) +{ + if (kl.size() > 0) return; // already made list + for (int i = 0; i < _elems; i++) + { + MatchElement* me = element[i]; + if (me) me->add_to_keylist(kl); + } +} + + +MatchCandidate::accept_state MatchCandidate::accept(MatchElement* k, QueryExpr* mexp) +{ + if (element[mexp->_childno]) { + if (_overlap) return M_OVERLAP; + return M_EXISTS; + } else { + if (order()) { + // Ensure that overlapping matches are not considered in ordered mode.. + if (k->startpos() < _endpos) { + _overlap++; + return M_OVERLAP; + } else { + _overlap--; // Found overlap.. + } + } + + element[mexp->_childno] = k; + + // Note that in 2.1.x match elements are no longer arriving in position order! 
+ // They may also overlap (because they may be complex candidates themselves) + if (!_nelems || (k->startpos() < _startpos)) { + _startpos = k->startpos(); + _starttoken = k->starttoken(); + } + _nelems++; + + // Update 2.0 term weight/element count/combined element word length + _elem_weight += weight(k, mexp); + + if (!_nelems || (k->endpos() > _endpos)) { + _endpos = k->startpos() + k->length(); + _endtoken = k->starttoken() + k->word_length(); + } + if (LOG_WOULD_LOG(spam)) { + std::string s("(accept:"); k->dump(s); + s.append(") "); dump(s); + LOG(spam, "%s", s.c_str()); + } + return M_OK; + } +} + + +int MatchCandidate::weight(MatchElement* me, QueryExpr* mexp) +{ + QueryTerm* texp = mexp->AsTerm(); + if (texp) return mexp->_weight; + MatchCandidate* m = reinterpret_cast<MatchCandidate*>(me); + return m->weight(); +} + +bool MatchCandidate::complete() +{ + if (_nelems < _elems) return false; + for (int i = 0; i < _elems; i++) + if (!element[i]->complete()) return false; + return true; +} + + +void MatchCandidate::log(std::string& logobj) +{ + char buf[200]; + for (int i = 0; i < _elems; i++) + { + if (element[i]) + { + sprintf(buf, "<td align=left>%" PRId64 "</td>", + static_cast<int64_t>(element[i]->starttoken())); + logobj.append(buf); + } + else + logobj.append("<td></td>"); + } + sprintf(buf, "<td align=right>%d</td><td align=right>%d</td>", word_distance(),rank()); + logobj.append(buf); +} + +// Check optional WITHIN(limit) constraints: +bool MatchCandidate::matches_limit() +{ + if (!match()->HasLimit()) return true; + + // completeness check: + if (!complete()) return false; + + int limit = match()->Limit(); + size_t elem_word_len = element[0]->word_length(); + for (int i = 1; i < _elems; i++) + { + int prev_term = i - 1; + elem_word_len += element[i]->word_length(); + // Order check: + if (order() && element[prev_term]->starttoken() >= element[i]->starttoken()) + return false; + } + + // Then check that within total limit: + if (((int)word_length() - 
(int)elem_word_len) > limit * (_elems - 1)) return false; + return true; +} + +bool gtematch_cand::gtDistance(const MatchCandidate* m1, const MatchCandidate* m2) const +{ + int m1d(m1->word_distance()), m2d(m2->word_distance()); + return (m1d < m2d) + ? true + : (m1d > m2d) + ? false + : m1->startpos() < m2->startpos(); +} + +// A suitable comparator for MatchCandidates +bool gtematch_cand::operator()(const MatchCandidate* m1, const MatchCandidate* m2) const +{ + // replace return m1->rank() > m2->rank(); + // which does return (_elem_weight << 11) - (word_distance() << 8) - (_startpos >> 8); + return (m1->weight() > m2->weight()) + ? true + : (m1->weight() < m2->weight()) + ? false + : gtDistance(m1, m2); +} diff --git a/juniper/src/vespa/juniper/mcand.h b/juniper/src/vespa/juniper/mcand.h new file mode 100644 index 00000000000..84967391b01 --- /dev/null +++ b/juniper/src/vespa/juniper/mcand.h @@ -0,0 +1,131 @@ +// Copyright 2016 Yahoo Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. 
+/* $Id$ */ + +#pragma once + +#include "keyocc.h" +#include <string> +#include "querynode.h" + +class Matcher; + +/** This is the Juniper 1.0 version of MatchCandidate + * To be replaced by matchcand.{h,cpp} + */ + +class MatchCandidate; + +struct gtematch_cand { + bool operator()(const MatchCandidate* m1, const MatchCandidate* m2) const; + bool gtDistance(const MatchCandidate* m1, const MatchCandidate* m2) const; +}; +typedef JUNIPER_MULTISET<MatchCandidate*, gtematch_cand> match_candidate_set; + +class MatchCandidate : public MatchElement +{ +public: + MatchElement** element; + + enum accept_state + { + M_OK, + M_EXISTS, + M_OVERLAP, + M_EXPIRED, + M_MAX + }; +private: + QueryExpr* _match; + int _nelems, _elems; + // _startpos in superclass + off_t _endpos; + off_t _endtoken; + long _docid; + off_t _ctxt_start; + size_t _elem_weight; // Combination of #elements and their weight, normal weight ~ 100 + int _options; + int _overlap; // Handle terms matching multiple elements in ordered (distinct) mode + uint32_t _refcnt; // reference count for this object + + MatchCandidate(MatchCandidate &); + MatchCandidate &operator=(MatchCandidate &); + +public: + keylist _klist; + + MatchCandidate(QueryExpr* query, MatchElement** elms, off_t ctxt_start); + virtual ~MatchCandidate(); + void ref() { ++_refcnt; } + uint32_t deref() { --_refcnt; return _refcnt; } + virtual void set_valid(); + virtual void dump(std::string& s); + + inline int elems() const { return _nelems; } + inline int elem_store_sz() const { return _elems; } + inline int word_distance() const { return _elems ? 
_endtoken - _starttoken - (_elems - 1) : 0; } + inline off_t ctxt_startpos() const { return _ctxt_start; } + virtual inline off_t endtoken() const { return _endtoken; } + virtual inline off_t endpos() const { return _endpos; } + inline ssize_t size() const { return _endpos - _startpos; } + inline bool order() const { return _options & X_ORDERED; } + inline bool partial_ok() const { return !(_options & X_COMPLETE); } + inline QueryExpr* match() { return _match; } + inline int weight() const { return _elem_weight; } + inline size_t word_length() const { return _endtoken - _starttoken; } + + virtual bool complete(); + int weight(MatchElement* me, QueryExpr* mexp); + + virtual size_t length() const { return _endpos - _startpos; } + + virtual MatchCandidate* Complex() { return this; } + + virtual void add_to_keylist(keylist& kl); + void make_keylist(); + + // A simple ranking function for now: Make sure those matches with + // more keywords present gets ranked higher even if distance is + // higher. + // + // Equal distance matches with similar amount of words should be ranked + // according to how early in document they are. + // + // Rank function criterias: + // 1. number and weight of elements in match + // 2. distance in bytes between the elements (excluding the elements itself) + // 3. significans of elements wrt. query order - order preserval [TBD] + // 4. position in document + // + // Note that for start positions > 64K, each 256 byte further out in document + // equals a distance increase by 8 bytes. + // + // also note that (for Juniper 1.0.x) + // a distance increase of 512 bytes yields the effect of having one less + // element in the match. 
+ // + // Note (Juniper 2.0.x) + // A normal keyword weight is assumed to be 100, with accepted range from 0 to 100000 + // Typical absolute values of the rank metric then becomes higher in 2.0 than in 1.0.x + // and this also boosts the effect of having more keywords with significant weights + // relative to the distance: a weight increase of 1 point on a single term now + // equals a 16-byte distance, while a 100 byte weight increase (typical term addition) + // equals 1600 bytes of distance increase. + // + inline int rank() const + { +#ifdef JUNIPER_1_0_RANK + // Just kept this here for reference.. + return (_nelems << 14) - ((_distance & ~0x7) << 5) - (_startpos >> 8); +#else + return (_elem_weight << 11) - (word_distance() << 8) - (_startpos >> 8); +#endif + } + + accept_state accept(MatchElement* k, QueryExpr* match); + + // Check optional WITHIN(limit) constraints: + bool matches_limit(); + void log(std::string& logobj); + void SetDocid(long id) { _docid = id; } +}; + diff --git a/juniper/src/vespa/juniper/multiset.h b/juniper/src/vespa/juniper/multiset.h new file mode 100644 index 00000000000..a50509e1660 --- /dev/null +++ b/juniper/src/vespa/juniper/multiset.h @@ -0,0 +1,81 @@ +// Copyright 2016 Yahoo Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. 
+#pragma once +#include <algorithm> +#include <vector> +#include <cstddef> + +#ifdef __GNUC__ +#define USE_STL_WORKAROUNDS 1 +#endif + +#ifdef USE_STL_WORKAROUNDS + +namespace fast { + +// STL like wrapper around Fast_Array providing multiset functionality + + +template <class ValueType, class Comparator> +class multiset +{ +public: + + class iterator + : public std::iterator<std::bidirectional_iterator_tag, ValueType, + ptrdiff_t, ValueType*, ValueType&> + { + public: + iterator(multiset<ValueType, Comparator>& mset, int pos) : _myset(mset), _pos(pos) {} + inline ValueType operator*() { return _myset._values[_pos]; } + inline iterator& operator++() { _pos++; return *this; } + inline iterator& operator--() { _pos--; return *this; } + inline bool operator!=(const iterator& i2) { return i2._pos != _pos; } + inline bool operator==(const iterator& i2) { return i2._pos == _pos; } + protected: + friend class multiset; + const multiset<ValueType, Comparator>& _myset; + int _pos; + }; + + inline multiset() : _values(), _sorted(true) {} + + inline bool insert(ValueType& v) + { + _sorted = false; + _values.push_back(v); + return true; + } + + inline void clear() { _values.clear(); _sorted = true; } + + inline int size() const { return _values.size(); } + + iterator begin() { sort(); return iterator(*this, 0); } + iterator end() { return iterator(*this, size()); } + +protected: + inline void sort() + { + if (!_sorted) { + std::stable_sort(_values.begin(), _values.end(), Comparator()); + _sorted = true; + } + } + +private: + friend class iterator; + std::vector<ValueType> _values; + bool _sorted; +}; // end class multiset + +} // end namespace fast + +#define JUNIPER_MULTISET fast::multiset +#define JUNIPER_SET fast::multiset + +#else +#include <set> +#define JUNIPER_MULTISET std::multiset +#define JUNIPER_SET std::set +#endif + diff --git a/juniper/src/vespa/juniper/propreader.cpp b/juniper/src/vespa/juniper/propreader.cpp new file mode 100644 index 00000000000..3797e0d02b7 
--- /dev/null +++ b/juniper/src/vespa/juniper/propreader.cpp @@ -0,0 +1,98 @@ +// Copyright 2016 Yahoo Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. +/* + * Simple property reader (format a la fsearchrc) + */ + +#include <vespa/fastos/fastos.h> +#include <vespa/log/log.h> +LOG_SETUP(".juniper.propreader"); +#include "propreader.h" +#include <vespa/fastlib/io/bufferedfile.h> +#include "juniperdebug.h" + +PropReader::PropReader(const char* filename) + : _keymap() +{ + Process(filename); +} + +#define BUFLEN 1024 + + +/** Read the property file and insert every "key value" line into the key map. + * Lines starting with '#' or with a blank, and empty lines, are skipped. + * Key and value are separated by whitespace and the value ends at the next whitespace. + * In the value a backslash escapes the following character, and \xHH (two hex digits) + * denotes the byte with hexadecimal value HH. + * All scans stop at the string terminator, so key-only lines, a trailing backslash + * or a truncated \x escape never read or write outside the line contents. + * @param filename path of the property file; if it cannot be opened a warning is + * logged and no properties are added + */ +void PropReader::Process(const char* filename) +{ + Fast_BufferedFile propfile; + propfile.ReadOpen(filename); + if (!propfile.IsOpened()) + { + LOG(warning, "Warning: Could not find property file '%s', using Juniper default values", + filename); + return; + } + char line[BUFLEN]; + char* linep; + while ((linep = propfile.ReadLine(line, BUFLEN-1)) != NULL) + { + int i; + char* key; + if (line[0] == '#') continue; // skip comments + + // find key (stop at whitespace or at the end of the string) + for (i = 0; line[i] != '\0' && !isspace(line[i]); i++) { } + if (i == 0) continue; // Skip empty lines and lines starting with blank + if (line[i] != '\0') line[i++] = 0; // only step past a separator, never past the terminator + key = line; + + for (; isspace(line[i]); i++) { } // Skip blanks + + // find value (stop at whitespace or at the end of the string) + int offset = 0; + char* value = &line[i]; + for (; line[i] != '\0' && !isspace(line[i]); i++) + { + // a trailing backslash has nothing to escape and is kept as an ordinary character + if (line[i] == '\\' && line[i + 1] != '\0') + { + offset++; + // only decode \xHH when both hex digits are really present + if (line[++i] == 'x' + && isxdigit(static_cast<unsigned char>(line[i + 1])) + && isxdigit(static_cast<unsigned char>(line[i + 2]))) + { + unsigned char v = 0; + for (int s = 1; s <= 2; s++, v<<=4) + { + unsigned char c = static_cast<unsigned char>(line[i + s]); + if (isdigit(c)) + v += (c - '0'); + else if (c < 'a') + v += (c - 'A' + 10); + else + v += (c - 'a' + 10); + if (s == 2) break; + } + line[i - offset] = static_cast<char>(v); + i += 2; + offset += 2; + } + else + if (offset != 0) line[i - offset] = line[i]; + } + else + if (offset != 0) line[i - offset] = line[i]; + } + line[i - offset] = 0; + LOG(debug, "Parameter :%s: value :%s:", key, value); + _keymap.Insert(key, value); + } +} + + +const char* PropReader::GetProperty(const char* name, const char*
def) +{ + const char* v = _keymap.Lookup(name, def); + LOG(debug, "Parameter lookup :%s: value :%s:", name, v); + return v; +} + + +void PropReader::UpdateProperty(const char* name, const char* value) +{ + _keymap.Insert(name, value); +} diff --git a/juniper/src/vespa/juniper/propreader.h b/juniper/src/vespa/juniper/propreader.h new file mode 100644 index 00000000000..dfc0a92e31f --- /dev/null +++ b/juniper/src/vespa/juniper/propreader.h @@ -0,0 +1,22 @@ +// Copyright 2016 Yahoo Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. +#pragma once + +#include "IJuniperProperties.h" +#include "stringmap.h" + +/** Simple property reader using same format as fsearchrc. + * Implemented for standalone testing of Juniper. + */ +class PropReader : public IJuniperProperties +{ +public: + PropReader(const char* filename); + virtual const char* GetProperty(const char* name, const char* def = NULL); + void UpdateProperty(const char* name, const char* value); + virtual ~PropReader() {} +protected: + void Process(const char* filename); +private: + Fast_StringMap _keymap; +}; + diff --git a/juniper/src/vespa/juniper/query.h b/juniper/src/vespa/juniper/query.h new file mode 100644 index 00000000000..e4e2d7c3ee8 --- /dev/null +++ b/juniper/src/vespa/juniper/query.h @@ -0,0 +1,199 @@ +// Copyright 2016 Yahoo Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. +/* $Id$ */ + +#pragma once + +#ifndef JUNIPER_RPIF +#define JUNIPER_RPIF 1 +#endif + +#include <vespa/fastos/fastos.h> + +/** @file query.h + * This file describes describes Juniper's expected interface for + * advanced query processing. Clients of Juniper wishing to receive optimal + * teasers based on the original query should use this interface. + * Design principle: visitor pattern - adapted to allow minimal overhead + * and opaque implementation (at binary level) of the IQueryItem class. 
+ * Query provider (such as fsearch/qrserver/DS doc.proc.pipeline) should implement + * IQuery such that the appropriate Visit* functions of IQueryVisitor gets called + * Proper IQueryVisitor instance(s) is implemented by Juniper. + * + * Note that Juniper v.1.0.x also provides a more low level simple query + * interface through SimpleDynamicSummary. This interface only supports a query on + * the (abstract) form ((phrase|keyword) OR)* by means of a + * single string of null or space separated words and is depreciated as of Juniper v.2.0.x + */ + +namespace juniper +{ + +enum ItemCreator +{ + CREA_ORIG = 0, // Original user query + CREA_FILTER // Automatically applied filter (no specific type) +}; + + +// For debugging purposes: return a text string with the creator enum name +const char* creator_text(ItemCreator); + +class IQueryVisitor; + +/** Opaque datatype implemented by provider + */ +struct QueryItem; + +/** This is the basic query type, implemented by the query provider + */ +class IQuery +{ +public: + virtual ~IQuery() {} + + /** Traverse the query. + * This will lead to a prefix depth first traversal of the complete query + * and calls to the appropriate Visitor functions. 
+ */ + virtual bool Traverse(IQueryVisitor* v) const = 0; + + /** @param item A query item to check + * @return A weight assigned to this term (default 100 (%) ) + */ + virtual int Weight(const QueryItem* item) const = 0; + + /** @param item A query item to check + * @return The creator module associated with this term + */ + virtual ItemCreator Creator(const QueryItem* item) const = 0; + + /** Return a text string representing any index specification used for this term + * @param item A query item to check + * @param length the length of the returned string + * @return A text containing the name of the index associated with this term + */ + virtual const char* Index(const QueryItem* item, size_t* length) const = 0; + + /** Check if the index specification associated with the query item is useful from + * a Juniper perspective (see fsearchrc, highlightindexes parameter) + * @param item A query item to check + * @return true if this index is valid for Juniper, false otherwise + */ + virtual bool UsefulIndex(const QueryItem* item) const = 0; +}; + + +/** IQueryVisitor is implemented by Juniper to enable Juniper to traverse the + * structure of an input query (Visitor pattern) + */ +class IQueryVisitor +{ +public: + + /** To be called upon by IQuery::Traverse visiting an AND query item + * @param item The (opaque to IQueryVisitor) item that is visited + * @param arity The number of children of this item + * @return if false, caller should skip calling this element's children visitors, + * otherwise caller should proceed as normal + */ + virtual bool VisitAND(const QueryItem* item, int arity) = 0; + + /** To be called upon by IQuery::Traverse visiting an OR query item + * @param item The (opaque to IQueryVisitor) item that is visited + * @param arity The number of children of this item + * @return if false, caller should skip calling this element's children visitors, + * otherwise caller should proceed as normal + */ + virtual bool VisitOR(const QueryItem* item, int 
arity) = 0; + + /** To be called upon by IQuery::Traverse visiting an ANY query item + * @param item The (opaque to IQueryVisitor) item that is visited + * @param arity The number of children of this item + * @return if false, caller should skip calling this element's children visitors, + * otherwise caller should proceed as normal + */ + virtual bool VisitANY(const QueryItem* item, int arity) = 0; + + /** To be called upon by IQuery::Traverse visiting a NEAR query item + * @param item The (opaque to IQueryVisitor) item that is visited + * @param arity The number of children of this item + * @param limit The number of words that defines the nearness wanted + * @return if false, caller should skip calling this element's children visitors, + * otherwise caller should proceed as normal + */ + virtual bool VisitNEAR(const QueryItem* item, int arity, int limit) = 0; + + /** To be called upon by IQuery::Traverse visiting a WITHIN query item + * @param item The (opaque to IQueryVisitor) item that is visited + * @param arity The number of children of this item + * @param limit The number of words that defines the nearness wanted + * @return if false, caller should skip calling this element's children visitors, + * otherwise caller should proceed as normal + */ + virtual bool VisitWITHIN(const QueryItem* item, int arity, int limit) = 0; + + /** To be called upon by IQuery::Traverse visiting a RANK query item + * @param item The (opaque to IQueryVisitor) item that is visited + * @param arity The number of children of this item + * @return if false, caller should skip calling this element's children visitors, + * otherwise caller should proceed as normal + */ + virtual bool VisitRANK(const QueryItem* item, int arity) = 0; + + /** To be called upon by IQuery::Traverse visiting a PHRASE query item + * @param item The (opaque to IQueryVisitor) item that is visited + * @param arity The number of children of this item + * @return if false, caller should skip calling this element's
children visitors, + * otherwise caller should proceed as normal + */ + virtual bool VisitPHRASE(const QueryItem* item, int arity) = 0; + + /** To be called upon by IQuery::Traverse visiting an ANDNOT query item + * @param item The (opaque to IQueryVisitor) item that is visited + * @param arity The number of children of this item + * @return if false, caller should skip calling this element's children visitors, + * otherwise caller should proceed as normal + */ + virtual bool VisitANDNOT(const QueryItem* item, int arity) = 0; + + /** To be called upon by IQuery::Traverse visiting a THRESHOLD query item + * @param item The (opaque to IQueryVisitor) item that is visited + * @param arity The number of children of this item + * @param threshold The threshold value for which the sum of the individual + * subexpressions' weights (as obtained by the IQueryVisitor::Weight function + * should result in a hit for the THRESHOLD expression. + * @return if false, caller should skip calling this element's children visitors, + * otherwise caller should proceed as normal + */ + virtual bool VisitTHRESHOLD(const QueryItem* item, int arity, int threshold) = 0; + + /** To be called upon by IQuery::Traverse visiting any other query item + * than the ones handled by Juniper (to avoid inconsistency in the + * traversal wrt. arities) + * @param item The (opaque to IQueryVisitor) item that is visited + * @param arity The number of children of this item (may be 0 if leaf node) + * @return typically false to denote that caller should skip calling this + * element's children visitors, + */ + virtual bool VisitOther(const QueryItem* item, int arity) = 0; + + /** Visit callback for the terminal type, to be called by IQuery::Traverse + * when encountering individual keywords + * @param item The (opaque to IQueryVisitor) item that is visited + * @param keyword Textual representation of the query keyword in question + * @param length Length of the keyword. 
If 0, it means keyword length is defined by + * null termination + * @param prefix true if prefix match with this term is desired + * otherwise caller should proceed as normal + * @param specialToken true if this term is treated as a special token + */ + virtual void VisitKeyword(const QueryItem* item, + const char* keyword, const size_t length = 0, + bool prefix = false, bool specialToken = false) = 0; + + virtual ~IQueryVisitor() {}; +}; + + +} // end namespace juniper + diff --git a/juniper/src/vespa/juniper/queryhandle.cpp b/juniper/src/vespa/juniper/queryhandle.cpp new file mode 100644 index 00000000000..67a4fe1ced5 --- /dev/null +++ b/juniper/src/vespa/juniper/queryhandle.cpp @@ -0,0 +1,213 @@ +// Copyright 2016 Yahoo Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. +/* $Id$ */ + +#include <vespa/fastos/fastos.h> +#include <vespa/log/log.h> +LOG_SETUP(".juniper.queryhandle"); +#include "query.h" +#include "queryhandle.h" +#include "juniperdebug.h" +#include "Matcher.h" +#include "queryparser.h" +#include "querymodifier.h" + +namespace juniper +{ + +QueryHandle::QueryHandle(const IQuery& fquery, const char* options, QueryModifier & modifier) : + _mo(NULL), + _privileged_port(false), + _dynsum_len(-1), + _max_matches(-1), + _surround_max(-1), + _stem_extend(-1), + _stem_min(-1), + _winsize(-1), + _winsize_fallback_multiplier(-1), + _max_match_candidates(-1), + _querytext(""), + _expansion_cache(NULL), + _log_mask(0), _options(0), _limit(0), _has_expansions(false), + _has_reductions(false) +{ + QueryVisitor* vis; + + /* Parse the options parameter structure, the parameter + * will be invalid later on so results must be stored here + */ + parse_parameters(options); + + /* Then parse the original query */ + if (_querytext.size() > 0) { + // override the fastserver query stack with the query.xxx stack value: + QueryParser q(_querytext.c_str()); + LOG(debug, "Using juniper specific query '%s'", _querytext.c_str()); + vis = 
new QueryVisitor(q, this, modifier); + } else { + vis = new QueryVisitor(fquery, this, modifier); + } + + QueryExpr* query = vis->GetQuery(); + if (query) { + if (LOG_WOULD_LOG(debug)) { + std::string s; + query->Dump(s); + LOG(debug, "juniper::QueryHandle: stack dump: %s", s.c_str()); + } + + /* The default match object keeps a set of "compiled" data for the + * original query (no language dependent expansion or rewriting (reduction) + * applied) + */ + _mo = new MatchObject(query, _has_reductions); + if (_has_expansions) { // set by query visitor... + _expansion_cache = new ExpansionCache(_mo); + } + } + else { + LOG(debug, "juniper::QueryHandle: stack dump: (no stack)"); + } + + delete vis; +} + +QueryHandle::~QueryHandle() +{ + LOG(debug, "juniper: Deleting query handle"); + if (_expansion_cache) delete _expansion_cache; + if (_mo) delete _mo; +} + + +MatchObject* QueryHandle::MatchObj(uint32_t langid) +{ + if (!_expansion_cache || (int)langid < 0) return _mo; + return _expansion_cache->Lookup(langid); +} + + +void QueryHandle::SetExpansions() +{ + _has_expansions = true; +} + + +void QueryHandle::SetReductions() +{ + _has_reductions = true; +} + + +// small utility +std::string fetchtext(char* cur, char** next) +{ + *next = cur; + while (**next != '\0' && **next != '_') (*next)++; + std::string t(cur, *next); + return t; +} + +
+
+/**
+ * Parse a juniperoptions command string and store the recognized settings
+ * on this handle.
+ *
+ * The string is scanned as a sequence of "<name>.<value>" settings; after
+ * each attempt the scan skips forward to just past the next '_'. A leading
+ * "priv.<n>" marks the handle as privileged when n > 0, and the "log." and
+ * "debug." settings are only acted upon for a privileged handle. Settings
+ * that are not recognized are skipped.
+ *
+ * @param options NUL-terminated option string; NULL means "no options".
+ */
+void QueryHandle::parse_parameters(const char* options)
+{
+    if (!options) return;
+    // strtol & friends want a char** end pointer; the text itself is never written to.
+    char* p = const_cast<char*>(options);
+
+    LOG(debug, "juniper parameter string '%s'", options);
+
+    // Initially check for a privileged option enable from QR server:
+    if (strncmp(p, "priv.", 5) == 0)
+    {
+        p += 5;
+        SetPrivileged((strtol(p, &p, 10) > 0));
+    }
+
+    // options contains a juniperoptions command string, parse it
+    for (;*p != '\0';)
+    {
+        if (strncmp(p, "dynlength.", 10) == 0)
+        {
+            p += 10;
+            _dynsum_len = strtol(p, &p, 0);
+            LOG(debug, "Setting dynsum.length to %d", _dynsum_len);
+        }
+        else if (strncmp(p, "dynmatches.", 11) == 0)
+        {
+            p += 11;
+            _max_matches = strtol(p, &p, 0);
+        }
+        else if (strncmp(p, "dynsurmax.", 10) == 0)
+        {
+            p += 10;
+            _surround_max = strtol(p, &p, 0);
+        }
+        else if (strncmp(p, "query.", 6) == 0)
+        {
+            p += 6;
+            // Everything up to the next '_' (or end of string) is the override query text.
+            _querytext = fetchtext(p, &p);
+        }
+        else if (strncmp(p, "near.", 5) == 0)
+        {
+            p += 5;
+            _limit = strtoul(p, &p, 0);
+            _options |= X_LIMIT | X_COMPLETE | X_CONSTR | X_CHKVAL;
+            LOG(debug, "juniper parameter: Setting NEAR(%d)", _limit);
+        }
+        else if (strncmp(p, "within.", 7) == 0)
+        {
+            p += 7;
+            _limit = strtoul(p, &p, 0);
+            _options |= X_LIMIT | X_ORDERED | X_COMPLETE | X_CONSTR | X_CHKVAL;
+            LOG(debug, "juniper parameter: Setting WITHIN(%d)", _limit);
+        }
+        else if (strncmp(p, "onear.", 6) == 0)
+        {
+            p += 6;
+            _limit = strtoul(p, &p, 0);
+            _options |= X_LIMIT | X_ORDERED | X_COMPLETE | X_CONSTR | X_CHKVAL;
+            LOG(debug, "juniper parameter: Setting ONEAR(%d)", _limit);
+        }
+        else if (strncmp(p, "stemext.", 8) == 0)
+        {
+            p += 8;
+            _stem_extend = strtoul(p, &p, 0);
+        }
+        else if (strncmp(p, "stemmin.", 8) == 0)
+        {
+            p += 8;
+            _stem_min = strtoul(p, &p, 0);
+        }
+        else if (strncmp(p, "winsize.", 8) == 0)
+        {
+            p += 8;
+            _winsize = strtoul(p, &p, 0);
+        }
+        else if (strncmp(p, "winsize_fallback_multiplier.", 28) == 0)
+        {
+            p += 28;
+            // The member is a double, so parse it as one: strtoul would cut
+            // "1.5" down to 1 and turn "-1" into a huge unsigned value.
+            _winsize_fallback_multiplier = strtod(p, &p);
+        }
+        else if (strncmp(p, "max_match_candidates.", 21) == 0)
+        {
+            p += 21;
+            _max_match_candidates = strtoul(p, &p, 0);
+        }
+        else if (Privileged())
+        {
+            if (strncmp(p, "log.", 4) == 0)
+            {
+                p += 4;
+                SetLog(strtol(p, &p, 0));
+            }
+            else if (strncmp(p, "debug.", 6) == 0)
+            {
+                p += 6;
+                juniper::SetDebug(strtol(p, &p, 0));
+            }
+        }
+        // Skip whatever is left of this setting (or an unrecognized one)
+        // and step over the '_' separator, if any.
+        while (*p != '\0' && *p != '_') p++;
+        if (*p == '_') p++;
+    }
+} // end parse_parameters
+ +} // end juniper namespace diff --git a/juniper/src/vespa/juniper/queryhandle.h b/juniper/src/vespa/juniper/queryhandle.h new file mode 100644 index 00000000000..f7f04d9e06f --- /dev/null +++ b/juniper/src/vespa/juniper/queryhandle.h @@ -0,0 +1,82 @@ +// 
Copyright 2016 Yahoo Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. +/* $Id$ */ + +#pragma once + +/* Juniper internal interface */ + +class Matcher; +class MatchObject; + +#include <vector> +#include "queryvisitor.h" +#include "expcache.h" + +typedef std::vector<QueryTerm*> queryterm_vector; +typedef std::vector<QueryNode*> querynode_vector; + +namespace juniper +{ + +/** Juniper internal definition of the query handle + * The query handle keeps a (default) match object for that query + * and possibly a set of additional match objects for expanded queries + * based on available expanders. + * Which matchobject to use for a result is then determined + * by the language ID. + */ + +class QueryHandle +{ +public: + QueryHandle(const IQuery& fquery, const char* options, QueryModifier & modifier); + ~QueryHandle(); + + void SetSimpleQuery(Matcher* m); + inline void SetPrivileged(bool priv) { _privileged_port = priv; } + inline bool Privileged() { return _privileged_port; } + inline void SetLog(uint32_t mask) { _log_mask = mask; } + + /** Find the currect match object to use for this language and query */ + MatchObject* MatchObj(uint32_t langid); + + /** Inform handle that there are expansions */ + void SetExpansions(); + /** Inform handle that there are reductions */ + void SetReductions(); +protected: + void parse_parameters(const char* options); +private: + MatchObject* _mo; // The default MatcherObject + bool _privileged_port; + + QueryHandle(QueryHandle &); + QueryHandle &operator=(QueryHandle &); +public: + // optional per query parameter override settings + // (default (-1) means use configured value, other value forces override) + int _dynsum_len; + int _max_matches; + int _surround_max; + int _stem_extend; + int _stem_min; + int64_t _winsize; + double _winsize_fallback_multiplier; + int64_t _max_match_candidates; + std::string _querytext; // an optional query string to use to override the input query + ExpansionCache* 
_expansion_cache; + + // parameter settings that are taken directly from + // this handle (eg. not overrides for config settings) + uint32_t _log_mask; + int _options; // query constraint bitmap as defined in querynode.h + int _limit; // WITHIN/NEAR limit by parameter + bool _has_expansions; // If set, the query must be replaced by a language dependent expansion (?) + bool _has_reductions; +}; + +void SetDebug(unsigned int mask); + +} // end namespace juniper + + diff --git a/juniper/src/vespa/juniper/querymodifier.cpp b/juniper/src/vespa/juniper/querymodifier.cpp new file mode 100644 index 00000000000..b5312d6fc32 --- /dev/null +++ b/juniper/src/vespa/juniper/querymodifier.cpp @@ -0,0 +1,65 @@ +// Copyright 2016 Yahoo Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. +#include <vespa/fastos/fastos.h> +#include <vespa/log/log.h> +LOG_SETUP(".juniper.querymodifier"); +#include "juniperdebug.h" +#include "querymodifier.h" +#include "foreach_utils.h" +#include "querynode.h" + + +namespace juniper +{ + +Rewriter::Rewriter(IRewriter* rewriter, bool for_query, bool for_document) + : _rewriter(rewriter), _for_query(for_query), _for_document(for_document) +{ + LOG(debug, "Creating Rewriter (%s %s)", + (for_query ? "query" : ""), (for_document ? 
"document" : "")); +} + + +QueryModifier::QueryModifier() + : _rewriters(), _has_expanders(false), _has_reducers(false) +{ } + +QueryModifier::~QueryModifier() +{ + FlushRewriters(); +} + + +void QueryModifier::FlushRewriters() +{ + // Delete all Rewriter objects + _rewriters.delete_second(); + _rewriters.clear(); +} + + +/* See rewriter.h for doc */ +void QueryModifier::AddRewriter(const char* index_name, IRewriter* rewriter, + bool for_query, bool for_document) +{ + if (for_query || for_document) + _rewriters.insert(index_name, new Rewriter(rewriter, for_query, for_document)); + if (for_query) _has_expanders = true; + if (for_document) _has_reducers = true; +} + + +/* Return any configured reducer/expander for the index, if any */ +Rewriter* QueryModifier::FindRewriter(const char* index_name) +{ + return _rewriters.find(index_name); +} + +
+
+/**
+ * Return any configured reducer/expander for the index whose name is the
+ * first 'length' bytes of index_name.
+ *
+ * @param index_name start of the index name; only 'length' bytes belong to it
+ * @param length     number of bytes of index_name that make up the name
+ * @return the result of the rewriter map lookup for that name
+ */
+Rewriter* QueryModifier::FindRewriter(const char* index_name, const size_t length)
+{
+    std::string idx_name(index_name, length);
+    // Look up the length-bounded copy: index_name itself may continue past
+    // 'length', so passing it directly would search for the wrong key.
+    return _rewriters.find(idx_name.c_str());
+}
+ + +} // end namespace juniper diff --git a/juniper/src/vespa/juniper/querymodifier.h b/juniper/src/vespa/juniper/querymodifier.h new file mode 100644 index 00000000000..0f28987be40 --- /dev/null +++ b/juniper/src/vespa/juniper/querymodifier.h @@ -0,0 +1,79 @@ +// Copyright 2016 Yahoo Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. +#pragma once + +#include "simplemap.h" +#include "query.h" +#include "rewriter.h" +#include <string> +#include <vector> + +class QueryTerm; +class QueryExpr; +class MatchObject; + +// This module encapsulates the preinitialized data structure for handling +// query or document rewriting. Eg. it is configured based on AddRewriter calls +// (See external header rewriter.h) and used until system shutdown.. + +// Note that per query state (reducematcher.h, expcache.h) and per hit state +// (Matcher.h) is maintained elsewhere.. 
+ +namespace juniper +{ + +class string_matcher; +class QueryModifier; + +// Wrapper around supplied IRewriter that in addition offer +// the way it is configured in the system +class Rewriter +{ +public: + Rewriter(IRewriter* rewriter, bool for_query, bool for_document); + inline bool ForQuery() { return _for_query; } + inline bool ForDocument() { return _for_document; } + inline RewriteHandle* Rewrite(uint32_t langid, const char* term) + { return _rewriter->Rewrite(langid, term); } + inline RewriteHandle* Rewrite(uint32_t langid, const char* term, size_t len) + { return _rewriter->Rewrite(langid, term, len); } + inline const char* NextTerm(RewriteHandle* exp, size_t& length) + { return _rewriter->NextTerm(exp, length); } +private: + IRewriter* _rewriter; + bool _for_query; + bool _for_document; + + Rewriter(Rewriter &); + Rewriter &operator=(Rewriter &); +}; + + +class QueryModifier +{ +public: + QueryModifier(); + virtual ~QueryModifier(); + + /** See rewriter.h for doc */ + void AddRewriter(const char* index_name, IRewriter* rewriter, + bool for_query, bool for_document); + + inline bool HasExpanders() { return _has_expanders; } + inline bool HasReducers() { return _has_reducers; } + inline bool HasRewriters() { return _has_expanders || _has_reducers; } + + /* Return any configured reducer/expander for the index, if any */ + Rewriter* FindRewriter(const char* index_name); + Rewriter* FindRewriter(const char* index_name, const size_t length); + + /* Delete/dereference all rewriters (needed for testing/debugging) */ + void FlushRewriters(); +private: + simplemap<std::string, Rewriter*> _rewriters; + bool _has_expanders; + bool _has_reducers; +}; + + +} // end namespace juniper + diff --git a/juniper/src/vespa/juniper/querynode.cpp b/juniper/src/vespa/juniper/querynode.cpp new file mode 100644 index 00000000000..75c47f5479b --- /dev/null +++ b/juniper/src/vespa/juniper/querynode.cpp @@ -0,0 +1,361 @@ +// Copyright 2016 Yahoo Inc. 
Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. +/* $Id$ */ + +#include <vespa/fastos/fastos.h> +#include <vespa/log/log.h> +LOG_SETUP(".juniper.querynode"); +#include "querynode.h" +#include "queryvisitor.h" +#include "juniperdebug.h" +#include <vespa/vespalib/util/stringfmt.h> + + + + +/** Implementation of the internal query data structure used by the matching engine + * in Matcher.h + */ + + +QueryExpr::QueryExpr(int weight, int arity) : + _options(0), _weight(weight), _arity(arity), _parent(NULL), _childno(0) +{} + +QueryExpr::QueryExpr(QueryExpr* e) : + _options(e->_options), + _weight(e->_weight), + _arity(e->_arity), + _parent(NULL), + _childno(0) +{} + +QueryExpr::~QueryExpr() +{} + + +QueryTerm::QueryTerm(const char* t, int length, int ix, int wgt) + : QueryExpr(wgt, 0), len(length), + ucs4_len(0), + total_match_cnt(0), exact_match_cnt(0), + idx(ix), rewriter(NULL), reduce_matcher(NULL), _rep(NULL), + _ucs4_term(NULL) +{ + if (len <= 0) + len = strlen(t); + _rep = new char[len+1]; + strncpy(_rep, t, len); _rep[len] = '\0'; + _ucs4_term = new ucs4_t[len+1]; + Fast_UnicodeUtil::ucs4copy(_ucs4_term, _rep); + ucs4_len = Fast_UnicodeUtil::ucs4strlen(_ucs4_term); +} + + +QueryTerm::QueryTerm(QueryTerm* t) + : QueryExpr(t), len(t->len), + ucs4_len(0), + total_match_cnt(0), exact_match_cnt(0), + idx(-1), rewriter(NULL), reduce_matcher(NULL), _rep(NULL), + _ucs4_term(NULL) +{ + _rep = new char[len+1]; + strncpy(_rep, t->term(), len); _rep[len] = '\0'; + _ucs4_term = new ucs4_t[len+1]; + Fast_UnicodeUtil::ucs4copy(_ucs4_term, _rep); + ucs4_len = Fast_UnicodeUtil::ucs4strlen(_ucs4_term); +} + + + +QueryTerm::~QueryTerm() +{ + delete[] _rep; + delete[] _ucs4_term; +} + + +QueryNode::QueryNode(int arity, int threshold, int weight) : + QueryExpr(weight, arity), _threshold(threshold), _limit(0), + _children(NULL), + _nchild(0), _node_idx(-1) +{ + assert(arity > 0); + _children = new QueryExpr*[arity]; +} + + 
+QueryNode::QueryNode(QueryNode* n) + : QueryExpr(n), + _threshold(n->_threshold), + _limit(n->_limit), + _children(NULL), + _nchild(0), + _node_idx(n->_node_idx) +{ + _children = new QueryExpr*[_arity]; +} + + + +QueryNode::~QueryNode() +{ + for (int i = 0; i < _nchild; i++) + delete _children[i]; + delete[] _children; + _nchild = 0; +} + + +int QueryNode::Limit() { return _options & X_LIMIT ? _limit : -1; } +int QueryTerm::Limit() { return 0; } + + +QueryNode* QueryTerm::AddChild(QueryExpr*) +{ + LOG(warning, "stack inconsistency, attempt to add children to a terminal node"); + + QueryNode* node = _parent; + while (node && node->Complete()) node = node->_parent; + return node; +} + + +QueryNode* QueryNode::AddChild(QueryExpr* child) +{ + if (!child) + _arity--; + else + { + child->_parent = this; + child->_childno = _nchild; + _children[_nchild++] = child; + if (child->_arity > 0) // we know this is a QueryNode from the arity info + return static_cast<QueryNode*>(child); + } + QueryNode* node = this; + while (node && node->Complete()) node = node->_parent; + return node; +} + + +void QueryExpr::ComputeThreshold() {} + +
+
+/**
+ * Compute threshold and constraint info for this node and, recursively,
+ * for all of its children.
+ *
+ * - X_AND nodes: the candidate threshold is the sum of the child weights.
+ * - X_OR / X_ANY nodes: the candidate threshold is the smallest child weight.
+ * - Nodes with none of those options: no threshold is computed.
+ *
+ * The candidate only replaces _threshold when _threshold is negative.
+ * X_CONSTR and X_CHKVAL bits set on any child are propagated to this node.
+ */
+void QueryNode::ComputeThreshold()
+{
+    bool no_threshold = false;
+    int th = 0;
+    if (_options & (X_OR|X_ANY))
+        th = 0xfffffff;   // start high so that std::min() below picks the smallest weight
+    else if (!(_options & X_AND))
+        no_threshold = true;
+
+    for (int i = 0; i < _nchild; i++)
+    {
+        QueryExpr* qe = _children[i];
+        qe->ComputeThreshold();
+        if (!no_threshold)
+        {
+            int w = qe->_weight;
+            // Must be a bit test: "_options | X_AND" is always non-zero and
+            // would make the min() branch for OR/ANY nodes unreachable.
+            if (_options & X_AND)
+                th += w;
+            else
+                th = std::min(th, w);
+        }
+        // Propagate any X_CONSTR and X_CHKVAL bit upwards
+        _options |= (qe->_options & (X_CONSTR | X_CHKVAL));
+    }
+    if ((!no_threshold) && _threshold < 0)
+        _threshold = th;
+}
+ + +void QueryTerm::Dump(std::string& out) +{ + out.append(term()); + out.append(vespalib::make_string("%s:%d", (_options & X_PREFIX ? 
"*" : ""), _weight)); +} + + +void QueryNode::Dump(std::string& out) +{ + out.append(vespalib::make_string("Node<a:%d", _arity)); + if (_options & X_ORDERED) out.append(",o"); + if (_options & X_NOT) out.append("!"); + if (_options & X_LIMIT) + out.append(vespalib::make_string(",l:%d", _limit)); + if (_options & X_EXACT) out.append(",e"); + if (_options & X_CHKVAL) out.append(",v"); + else if (_options & X_CONSTR) out.append(",z"); + if (_options & X_COMPLETE) out.append(",c"); + out.append(">["); + for (int i = 0; i < _nchild; i++) + { + if (i < _nchild && i > 0) out.append(","); + _children[i]->Dump(out); + } + out.append("]"); +} + + +bool QueryNode::StackComplete() +{ + // Stack is complete if rightmost nodes in tree are complete + return (Complete() && (!_arity || _children[_arity - 1]->StackComplete())); +} + + +bool QueryTerm::StackComplete() +{ + return true; +} + + +QueryNode* QueryNode::AsNode() +{ + return this; +} + +QueryNode* QueryTerm::AsNode() +{ + return NULL; +} + +QueryTerm* QueryNode::AsTerm() +{ + return NULL; +} + +QueryTerm* QueryTerm::AsTerm() +{ + return this; +} + +bool QueryTerm::Complex() +{ + return false; +} + +bool QueryNode::Complex() +{ + for (int i = 0; i < _nchild; i++) + { + if (_children[i]->_arity > 1) return true; + } + return false; +} + + +int QueryNode::MaxArity() +{ + int max_arity = _arity; + for (int i = 0; i < _nchild; i++) + { + int ma = _children[i]->MaxArity(); + if (ma > max_arity) max_arity = ma; + } + return max_arity; +} + + +bool QueryNode::AcceptsInitially(QueryExpr* n) +{ + assert(n->_parent == this); +// return (!(_options & X_ORDERED)) || n->_childno == 0; + // currently implicitly add all terms even for ordered.. 
+ (void) n; + return true; +} + + +/** Modify the given stack by eliminating unnecessary internal nodes + * with arity 1 or non-terms with arity 0 + */ +void SimplifyStack(QueryExpr*& orig_stack) +{ + if (!orig_stack) return; + QueryNode* node = orig_stack->AsNode(); + if (!node) return; // Leaf node - no simplifications possible + + int compact = 0; + int i; + if (!node->Complete()) + { + LOG(warning, "juniper: query stack incomplete, got arity %d, expected %d", + node->_nchild, node->_arity); + delete node; + orig_stack = NULL; + return; + } + + for (i = 0; i < node->_arity; i++) + { + if (i > 0 && (node->_options & X_ONLY_1)) + { + // Get rid of children # >2 for RANK/ANDNOT + delete node->_children[i]; + node->_children[i] = NULL; + } + else + SimplifyStack(node->_children[i]); + + if (node->_children[i] == NULL) + compact++; + } + if (compact > 0) + { + node->_nchild = 0; + for (i = 0; i < node->_arity; i++) + { + if (node->_children[i]) + { + if (i > node->_nchild) + { + // shift remaining nodes down - remember to update _childno for each node.. 
+ node->_children[node->_nchild] = node->_children[i]; + node->_children[i]->_childno = node->_nchild; + } + node->_nchild++; + } + } + assert(node->_arity == node->_nchild + compact); + node->_arity = node->_nchild; + } + + if (node->_arity <= 1) + { + QueryExpr* ret = NULL; + if (node->_arity == 1) + { + ret = node->_children[0]; + node->_children[0] = NULL; + ret->_parent = node->_parent; + ret->_childno = node->_childno; + } + delete node; + orig_stack = ret; + } +} + + +// default implementation of 2nd visit to QueryNode objs: +void IQueryExprVisitor::RevisitQueryNode(QueryNode*) +{ } + + +// visitor pattern: + +void QueryTerm::Accept(IQueryExprVisitor& v) +{ + v.VisitQueryTerm(this); +} + + +void QueryNode::Accept(IQueryExprVisitor& v) +{ + int i; + v.VisitQueryNode(this); + for (i = 0; i < _arity; i++) + _children[i]->Accept(v); + v.RevisitQueryNode(this); +} diff --git a/juniper/src/vespa/juniper/querynode.h b/juniper/src/vespa/juniper/querynode.h new file mode 100644 index 00000000000..84e7e800c8e --- /dev/null +++ b/juniper/src/vespa/juniper/querynode.h @@ -0,0 +1,186 @@ +// Copyright 2016 Yahoo Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. 
+/* $Id$ */ + +#pragma once + +#include <string> +#include <vector> +#include <vespa/fastlib/text/unicodeutil.h> +#include "querymodifier.h" + +/** The internal query data structure used by the matching engine + * in Matcher.h + */ + + +// Option bit definitions: +#define X_ORDERED 0x1 // PHRASE and WITHIN operator have the ordered property +#define X_LIMIT 0x2 // NEAR and WITHIN operators have the limit property +#define X_EXACT 0x4 // PHRASE and descendants have this property +#define X_COMPLETE 0x8 // All keywords must be present (NEAR/PHRASE/WITHIN) +#define X_AND 0x10 // threshold must be recomputed when complete - AND semantics +#define X_OR 0x20 // threshold must be recomputed when complete + OR semantics +#define X_ANY 0x40 // threshold must be recomputed when complete + ANY semantics +#define X_CONSTR 0x100 // Bit telling if this subquery has constraints applied somewhere +#define X_CHKVAL 0x200 // Bit set if validity of keyword occurrences must be checked +#define X_NOT 0x400 // Limit has opposite sign (eliminate below: NOT_WITHIN semantics) +#define X_PREFIX 0x1000 // This is a prefix search (valid on terms only) +#define X_POSTFIX 0x2000 // This is a postfix search (valid on terms only) +#define X_WILD 0x4000 // This is a wildcard search (valid on terms only) +#define X_ONLY_1 0x8000 // Tell simplifier to delete all childs but #1 (RANK/ANDNOT) +#define X_SPECIALTOKEN 0x10000 // This is a special token (valid on terms only) + +class QueryNode; +class QueryTerm; + +typedef std::vector<QueryNode*> querynode_vector; + + +// Support slightly extended visitor pattern for QueryExpr nodes.. 
+ +class IQueryExprVisitor +{ +public: + virtual ~IQueryExprVisitor() {} + + // Visit before visiting subnodes + virtual void VisitQueryNode(QueryNode*) = 0; + // visit after visiting subnodes - default: do nothing + virtual void RevisitQueryNode(QueryNode*); + + virtual void VisitQueryTerm(QueryTerm*) = 0; +}; + + +/** Base class for query expressions in Juniper */ +class QueryExpr +{ +public: + explicit QueryExpr(int weight, int arity); + explicit QueryExpr(QueryExpr* e); + + /** Add a child to the end of the list of children for this node. + * @param child A pointer to a child node to add or NULL to denote that + * we have eliminated a child from this node - to trigger an arity update + * @return A pointer to this node if more children are needed or else nearest + * parent that needs more children + */ + virtual QueryNode* AddChild(QueryExpr* child) = 0; + virtual ~QueryExpr(); + virtual int Limit() = 0; + virtual void Dump(std::string&) = 0; + virtual bool StackComplete() = 0; + virtual void ComputeThreshold(); + virtual QueryNode* AsNode() = 0; + virtual QueryTerm* AsTerm() = 0; + virtual bool Complex() = 0; + + virtual void Accept(IQueryExprVisitor& v) = 0; + + virtual int MaxArity() { return 0; } + + inline bool HasConstraints() { return _options & X_CONSTR; } + inline bool UsesValid() { return _options & X_CHKVAL; } + inline bool HasLimit() { return _options & X_LIMIT; } + inline bool Exact() { return _options & X_EXACT; } + + int _options; // Applied options (bitmap) for this node + int _weight; // Weight of this term by parent - if 0: weight is sum of children + int _arity; // Arity of this query subexpression (may get decremented..) 
+ QueryNode* _parent; // Pointer to parent or NULL if this is the root of the query + int _childno; // Position number within parent's children (0 if no parents) + +private: + QueryExpr(QueryExpr &); + QueryExpr &operator=(QueryExpr &); +}; + + +/** Internal node of a query + */ +class QueryNode : public QueryExpr +{ +public: + // Create a new node with arity children + QueryNode(int arity, int threshold, int weight = 0); + + // Create a copy of the node n wrt. arity etc, but without adding any children.. + explicit QueryNode(QueryNode* n); + + virtual ~QueryNode(); + virtual QueryNode* AddChild(QueryExpr* child); + virtual int Limit(); + inline bool Complete() { return _arity == _nchild; } + virtual void Dump(std::string&); + virtual bool StackComplete(); + virtual void ComputeThreshold(); + virtual QueryNode* AsNode(); + virtual QueryTerm* AsTerm(); + virtual bool Complex(); + virtual int MaxArity(); + + virtual void Accept(IQueryExprVisitor& v); + + // return true if a match for n should lead to creation of a new candidate node + // corresponding to this query tree node: + bool AcceptsInitially(QueryExpr* n); + + int _threshold; // Threshold for this expression node to be considered complete + int _limit; // NEAR/WITHIN limit if X_LIMIT option set + + /* Pointer to an array of length _arity of pointers to + * subqueries associated with this query */ + QueryExpr** _children; + int _nchild; // end pointer (fill level) of _children + int _node_idx; // Index (position) of this nonterminal within table of all nonterminals + +private: + QueryNode(QueryNode &); + QueryNode &operator=(QueryNode &); +}; + + +/** Terminal node of a query + */ +class QueryTerm : public QueryExpr +{ +public: + QueryTerm(const char* t, int length, int ix, int weight = 100); + explicit QueryTerm(QueryTerm* const); + virtual ~QueryTerm(); + virtual int Limit(); + virtual QueryNode* AddChild(QueryExpr* child); + virtual void Dump(std::string&); + virtual bool StackComplete(); + virtual 
QueryNode* AsNode(); + virtual QueryTerm* AsTerm(); + virtual bool Complex(); + + virtual void Accept(IQueryExprVisitor& v); + inline const char* term() { return _rep; } + inline const ucs4_t* ucs4_term() { return _ucs4_term; } + inline bool is_prefix() { return _options & X_PREFIX; } + inline bool is_wildcard() { return _options & X_WILD; } + inline bool isSpecialToken() { return _options & X_SPECIALTOKEN; } + + size_t len; + size_t ucs4_len; + int total_match_cnt; + int exact_match_cnt; + int idx; + juniper::Rewriter* rewriter; + juniper::string_matcher* reduce_matcher; +private: + char* _rep; + ucs4_t* _ucs4_term; + + QueryTerm(QueryTerm &); + QueryTerm &operator=(QueryTerm &); +}; + + +/** Modify the given stack by eliminating unnecessary internal nodes + * with arity 1 or non-terms with arity 0 + */ +void SimplifyStack(QueryExpr*& orig_stack); + diff --git a/juniper/src/vespa/juniper/queryparser.cpp b/juniper/src/vespa/juniper/queryparser.cpp new file mode 100644 index 00000000000..b7e4a49fcdf --- /dev/null +++ b/juniper/src/vespa/juniper/queryparser.cpp @@ -0,0 +1,246 @@ +// Copyright 2016 Yahoo Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. 
+/* Simple prefix query parser for Juniper for debugging purposes */ + +#include <vespa/fastos/fastos.h> +#include <vespa/log/log.h> +LOG_SETUP(".juniper.queryparser"); +#include "queryparser.h" +#include "juniperdebug.h" +#include <vector> + +#define TOK_NORM_OP 1 +#define TOK_PARAM1_OP 2 + +namespace juniper +{ + +// simple syntax tree + +class QueryItem +{ +public: + QueryItem(const char* name, int p1 = -1) : + _name(name), _index(""), _child(), _prefix(false), _p1(p1) + { } + + ~QueryItem() + { + for (std::vector<QueryItem*>::iterator it = _child.begin(); it != _child.end(); ++it) + delete *it; + } + + inline int arity() { return _child.size(); } + + void add(QueryItem* e) + { + _child.push_back(e); + LOG(debug, "Adding %s", e->_name.c_str()); + } + + std::string _name; + std::string _index; + std::vector<QueryItem*> _child; + bool _prefix; + int _p1; +}; + + +QueryParser::QueryParser(const char* query_string) : + _tokenizer(), + _op_to_type(), + _query_string(query_string), + _curtok(), + _v(NULL), + _exp(NULL), _parse_errno(0), _reached_end(false) +{ + _op_to_type["AND"] = TOK_NORM_OP; + _op_to_type["OR"] = TOK_NORM_OP; + _op_to_type["ANY"] = TOK_NORM_OP; + _op_to_type["RANK"] = TOK_NORM_OP; + _op_to_type["ANDNOT"] = TOK_NORM_OP; + _op_to_type["PHRASE"] = TOK_NORM_OP; + _op_to_type["NEAR"] = TOK_PARAM1_OP; + _op_to_type["WITHIN"] = TOK_PARAM1_OP; + _op_to_type["ONEAR"] = TOK_PARAM1_OP; + + _tokenizer.SetNewText(const_cast<char*>(_query_string), strlen(_query_string)); + if (_tokenizer.MoreTokens()) + { + next(); + _exp = ParseExpr(); + if (ParseError()) return; + } + else + { + _exp = NULL; + _parse_errno = 1; + return; + } + if (_tokenizer.MoreTokens()) + { + LOG(warning, "juniper::QueryParser: Warning: extra token(s) after end"); + _parse_errno = 2; + } +} + +void QueryParser::next() +{ + if (_reached_end) _parse_errno = 3; + if (!_tokenizer.MoreTokens()) + { + _reached_end = true; + return; + } + Tokenizer::Fast_Token token = _tokenizer.GetNextToken(); + 
_curtok.assign(token.first, token.second); + LOG(debug, "next: %s", _curtok.c_str()); +} + +bool QueryParser::match(const char* s, bool required) +{ + bool m = strcmp(_curtok.c_str(), s) == 0; + if (required && !m) { + LOG(warning, "juniper::QueryParser: Syntax error query string \"%s\", failed to match \"%s\"", + _query_string, s); + } + return m; +} + + +bool QueryParser::Traverse(IQueryVisitor* v) const +{ + const_cast<QueryParser*>(this)->_v = v; + if (_exp) trav(_exp); + return true; +} + + +int QueryParser::Weight(const QueryItem*) const +{ + return 100; +} + +ItemCreator QueryParser::Creator(const QueryItem*) const +{ + return CREA_ORIG; +} + +const char* QueryParser::Index(const QueryItem* e, size_t* len) const +{ + if (len) *len = e->_index.size(); + return e->_index.c_str(); +} + +bool QueryParser::UsefulIndex(const QueryItem*) const +{ + return true; +} + + +QueryParser::~QueryParser() +{ + delete _exp; +} + + +void QueryParser::trav(QueryItem* e) const +{ + if (e->arity() == 0) + _v->VisitKeyword(e, e->_name.c_str(), e->_name.size(), e->_prefix); + if (e->_name.compare("AND") == 0) _v->VisitAND(e, e->arity()); + else if (e->_name.compare("OR") == 0) _v->VisitOR(e, e->arity()); + else if (e->_name.compare("ANY") == 0) _v->VisitANY(e, e->arity()); + else if (e->_name.compare("ANDNOT") == 0) _v->VisitANDNOT(e, e->arity()); + else if (e->_name.compare("RANK") == 0) _v->VisitRANK(e, e->arity()); + else if (e->_name.compare("PHRASE") == 0) _v->VisitPHRASE(e, e->arity()); + else if (e->_name.compare("NEAR") == 0) _v->VisitNEAR(e, e->arity(), e->_p1); + else if (e->_name.compare("WITHIN") == 0) _v->VisitWITHIN(e, e->arity(), e->_p1); + else if (e->_name.compare("ONEAR") == 0) _v->VisitWITHIN(e, e->arity(), e->_p1); + + for (std::vector<QueryItem*>::iterator it = e->_child.begin(); it != e->_child.end(); ++it) + trav(*it); +} + +QueryItem* QueryParser::ParseExpr() +{ + int p1 = -1; + std::map<std::string, int>::iterator it = _op_to_type.find(_curtok); + if (it 
== _op_to_type.end()) + return ParseIndexTerm(); + std::string op = _curtok; + switch (it->second) + { + case TOK_NORM_OP: + break; + case TOK_PARAM1_OP: + next(); + if (!match("/", true)) return NULL; + next(); + p1 = atoi(_curtok.c_str()); + LOG(debug, "constraint operator %s - value %d", op.c_str(), p1); + break; + default: + assert(false); + } + next(); + if (!match("(", true)) return NULL; + QueryItem* e = new QueryItem(op.c_str(), p1); + do + { + if (ParseError()) return NULL; + next(); + QueryItem* ep = ParseExpr(); + if (!ep) + { + delete e; + return NULL; + } + e->add(ep); + } while (match(",")); + if (!match(")", true)) + { + delete e; + return NULL; + } + next(); + return e; +} + + +QueryItem* QueryParser::ParseIndexTerm() +{ + std::string t = _curtok; + next(); + if (match(":")) + { + next(); + LOG(debug, "ParseIndexTerm: %s:%s", t.c_str(), _curtok.c_str()); + QueryItem* e = ParseKeyword(); + if (e) e->_index = t; + return e; + } + else + return CheckPrefix(t); +} + +QueryItem* QueryParser::CheckPrefix(std::string& kw) +{ + std::string::size_type pos = kw.find_first_of("*?"); + bool prefix = pos == kw.size() - 1 && kw[pos] == '*'; + if (prefix) + kw.erase(pos); + QueryItem* e = new QueryItem(kw.c_str()); + e->_prefix = pos != std::string::npos; + return e; +} + + +QueryItem* QueryParser::ParseKeyword() +{ + LOG(debug, "ParseKeyword: %s", _curtok.c_str()); + QueryItem* e = CheckPrefix(_curtok); + next(); + return e; +} + +} // end namespace juniper diff --git a/juniper/src/vespa/juniper/queryparser.h b/juniper/src/vespa/juniper/queryparser.h new file mode 100644 index 00000000000..19de984a893 --- /dev/null +++ b/juniper/src/vespa/juniper/queryparser.h @@ -0,0 +1,64 @@ +// Copyright 2016 Yahoo Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. 
+#pragma once + + +/* Simple prefix syntax advanced query parser for Juniper debug/testing */ + +#include "query.h" +#include <map> +#include <string> +#include <vespa/fastlib/text/latintokenizer.h> + +namespace juniper +{ + +struct IsPunctuation { + bool operator()(char c) { + if (c == '*' || c == '?') + return false; + + return ispunct(static_cast<unsigned char>(c)) != 0; + } +}; + +typedef Fast_LatinTokenizer<Fast_IsSpace, IsPunctuation> WildcardTokenizer; + +class QueryParser : public IQuery +{ +private: + QueryParser(const QueryParser&); + QueryParser& operator= (const QueryParser&); +public: + QueryParser(const char* query_string); + virtual ~QueryParser(); + + virtual bool Traverse(IQueryVisitor* v) const; + virtual int Weight(const QueryItem* item) const; + virtual ItemCreator Creator(const QueryItem* item) const; + virtual const char* Index(const QueryItem* item, size_t* length) const; + virtual bool UsefulIndex(const QueryItem* item) const; + + inline int ParseError() { return _parse_errno; } +protected: + QueryItem* ParseExpr(); + QueryItem* ParseKeyword(); + QueryItem* ParseIndexTerm(); + QueryItem* CheckPrefix(std::string& kw); + void next(); + void trav(QueryItem*) const; + inline void setvisitor(IQueryVisitor* v) { _v = v; } + bool match(const char* s, bool required = false); +private: + typedef WildcardTokenizer Tokenizer; + Tokenizer _tokenizer; + std::map<std::string, int> _op_to_type; + const char* _query_string; + std::string _curtok; + IQueryVisitor* _v; + QueryItem* _exp; + int _parse_errno; + bool _reached_end; +}; + +} // end namespace juniper + diff --git a/juniper/src/vespa/juniper/queryvisitor.cpp b/juniper/src/vespa/juniper/queryvisitor.cpp new file mode 100644 index 00000000000..4d739204637 --- /dev/null +++ b/juniper/src/vespa/juniper/queryvisitor.cpp @@ -0,0 +1,320 @@ +// Copyright 2016 Yahoo Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. 
+/* $Id$ */ + +#include <vespa/fastos/fastos.h> +#include <vespa/log/log.h> +LOG_SETUP(".juniper.queryvisitor"); +#include "query.h" +#include "juniperdebug.h" +#include "queryvisitor.h" +#include "Matcher.h" +#include "queryhandle.h" +#include "querymodifier.h" + + +/* Implementation of the query visit interface between Juniper and the query provider */ +/* Implementation note: insert() must be called for all calls in the tree to + * keep the input in sync with the stack traversal. insert will call AddChild + * which in the cases of NULL as input will update the arity accordingly. + * Any zero children nodes as result of this will be eliminated by the simplifier. + */ + +void QueryVisitor::insert(QueryExpr* expr) +{ + if (_current) + _current = _current->AddChild(expr); + else + { + // Just a sanity check that there are no overflow stack elements + if (_got_stack && expr) + { + if (_query && _query->StackComplete()) { + LOG(warning, "juniper: Overflow stack element discarded"); + } + delete expr; + return; + } + if (expr) + { + _current = _query = expr; + _got_stack = true; + } + } +} + + + +QueryVisitor::QueryVisitor(const IQuery& fquery, QueryHandle* qhandle, juniper::QueryModifier & queryModifier) : + _queryModifier(queryModifier), + _fquery(&fquery), _query(NULL), _current(NULL), _qhandle(qhandle), + _term_index(0), _got_stack(false) +{ + /* Create a query node structure by traversing the input */ + bool ok_stack = fquery.Traverse(this); + + if (ok_stack) + { + /* Do necessary post processing on the query structure to ensure that all nodes have + * complete information: + */ + postprocess_query(); + } + else if (_query) + { + delete _query; + _query = NULL; + } + _fquery = NULL; // To avoid misuse after invalidation.. 
+} + + +QueryVisitor::~QueryVisitor() +{ + if (_query) delete _query; +} + + +void QueryVisitor::postprocess_query() +{ + if (LOG_WOULD_LOG(debug)) { + if (_query) { + std::string s; + _query->Dump(s); + LOG(debug, "juniper input stack: %s", s.c_str()); + } else { + LOG(debug, "juniper input stack: No stack found!"); + } + } + SimplifyStack(_query); + if (!_query) return; + // convert special case of one query term to a node with 1 child: + if (_query->_arity == 0) + { + QueryNode* newroot = new QueryNode(1, _query->_weight, _query->_weight); + newroot->AddChild(_query); + _query = newroot; + } + // Handle limit in root node only for now.. + if (!_current && _qhandle->_options & X_LIMIT) + { + QueryNode* qn = _query->AsNode(); + if (qn) qn->_limit = _qhandle->_limit; + } + _query->ComputeThreshold(); +} + + + +QueryExpr* QueryVisitor::GetQuery() +{ + QueryExpr* q = _query; + _query = NULL; + return q; +} + + +bool QueryVisitor::VisitAND(const QueryItem*, int arity) +{ + LOG(debug, "juniper: VisitAND[%d]", arity); + QueryNode* node = new QueryNode(arity, -1); + node->_options = _qhandle->_options | X_AND; + insert(node); + return true; +} + +bool QueryVisitor::VisitOR(const QueryItem*, int arity) +{ + LOG(debug, "juniper: VisitOR[%d]", arity); + QueryNode* node = new QueryNode(arity, -1); + node->_options = _qhandle->_options | X_OR; + insert(node); + return true; +} + +bool QueryVisitor::VisitANY(const QueryItem*, int arity) +{ + LOG(debug, "juniper: VisitANY[%d]", arity); + QueryNode* node = new QueryNode(arity, -1); + node->_options = _qhandle->_options | X_ANY; + insert(node); + return true; +} + +bool QueryVisitor::VisitNEAR(const QueryItem*, int arity, int limit) +{ + LOG(debug, "juniper: VisitNEAR(%d)[%d]", limit, arity); + QueryNode* node = new QueryNode(arity, -1); + node->_options = _qhandle->_options | X_AND | X_LIMIT | X_COMPLETE | X_CONSTR | X_CHKVAL; + node->_limit = limit; + insert(node); + return true; +} + +bool QueryVisitor::VisitWITHIN(const 
QueryItem*, int arity, int limit) +{ + LOG(debug, "juniper: VisitWITHIN(%d)[%d]", limit, arity); + QueryNode* node = new QueryNode(arity, -1); + node->_options = _qhandle->_options | X_AND | X_LIMIT | X_ORDERED | X_COMPLETE | X_CONSTR + | X_CHKVAL; + node->_limit = limit; + insert(node); + return true; +} + +bool QueryVisitor::VisitRANK(const QueryItem*, int arity) +{ + LOG(debug, "juniper: VisitRANK[%d]", arity); + QueryNode* node = new QueryNode(arity, -1); + node->_options = X_ONLY_1; // Only keep first child (simpl.executed by simplifier) + insert(node); + return true; +} + +bool QueryVisitor::VisitPHRASE(const QueryItem*, int arity) +{ + LOG(debug, "juniper: VisitPHRASE[%d]", arity); + // PHRASE is identical to WITHIN(0) + exact matches only + QueryNode* node = new QueryNode(arity, -1); + node->_options = _qhandle->_options | X_AND | X_LIMIT | X_ORDERED | X_COMPLETE | X_EXACT + | X_CHKVAL; + node->_limit = 0; + insert(node); + return true; +} + +bool QueryVisitor::VisitANDNOT(const QueryItem*, int arity) +{ + LOG(debug, "juniper: VisitANDNOT[%d]", arity); + QueryNode* node = new QueryNode(arity, -1); + node->_options = X_ONLY_1; // Only keep first child (simpl.executed by simplifier) + insert(node); + return true; +} + +bool QueryVisitor::VisitTHRESHOLD(const QueryItem*, int arity, int threshold) +{ + LOG(debug, "juniper: VisitTHRESHOLD[%d,%d]", arity, threshold); + QueryNode* node = new QueryNode(arity, threshold); + node->_options = _qhandle->_options; + + insert(node); + return true; +} + + +bool QueryVisitor::VisitOther(const QueryItem*, int arity) +{ + LOG(debug, "juniper: VisitOther[%d]", arity); + insert(NULL); + return false; +} + + +std::string QueryVisitor::get_index(const QueryItem* item) +{ + size_t len; + const char* ind = _fquery->Index(item, &len); + std::string s(ind, len); + return s; +} + + +void QueryVisitor::VisitKeyword(const QueryItem* item, const char* keyword, + const size_t length, bool prefix, bool specialToken) +{ + if (length == 0) 
{ + // Do not consider empty terms. + return; + } + juniper::ItemCreator creator = _fquery->Creator(item); + switch (creator) + { + case juniper::CREA_ORIG: + LOG(debug, "(juniper::VisitKeyword) Found valid creator '%s'", creator_text(creator)); + break; + default: + /** Keep track of eliminated children to have correct arity in rep. */ + insert(NULL); + if (LOG_WOULD_LOG(debug)) { + std::string s(keyword, length); + std::string ind = get_index(item); + LOG(debug, "juniper: VisitKeyword(%s:%s) - skip - unwanted creator %s", + ind.c_str(), s.c_str(), creator_text(creator)); + } + return; + } + + if (!_fquery->UsefulIndex(item)) + { + if (LOG_WOULD_LOG(debug)) { + std::string s(keyword, length); + std::string ind = get_index(item); + LOG(debug, "juniper: VisitKeyword(%s:%s) - not applicable index", ind.c_str(), s.c_str()); + } + insert(NULL); // keep arity of parent in sync! + return; + } + if (LOG_WOULD_LOG(debug)) { + std::string s(keyword, length); + std::string ind = get_index(item); + LOG(debug, "juniper: VisitKeyword(%s%s%s)", + ind.c_str(), (ind.size() > 0 ? ":" : ""), s.c_str()); + } + + QueryTerm* term = new QueryTerm(keyword, length, _term_index++, _fquery->Weight(item)); + if (prefix) + { + size_t tmplen = length; + while (tmplen > 0 && *keyword != '*' && *keyword != '?') + { + ++keyword; + --tmplen; + } + term->_options |= (tmplen == 0 ? X_PREFIX : X_WILD); + } + if (specialToken) { + term->_options |= X_SPECIALTOKEN; + } + if (_queryModifier.HasRewriters()) + { + size_t len; + const char* idx = _fquery->Index(item, &len); + if (idx) + { + // record any rewriter for easier lookup later on.. 
+ juniper::Rewriter* rh = _queryModifier.FindRewriter(idx, len); + if (rh) + { + term->rewriter = rh; + if (rh->ForQuery()) + { + // Notify query handler that an expansion query cache must be + // maintained for this query: + _qhandle->SetExpansions(); + } + if (rh->ForDocument()) + { + // Notify query handler that on-the-fly document rewriting might be needed + _qhandle->SetReductions(); + } + } + } + } + insert(term); +} + + +namespace juniper +{ + +const char* creator_text(ItemCreator creator) +{ + switch (creator) + { + case CREA_ORIG: return "CREA_ORIG"; + case CREA_FILTER: return "CREA_FILTER"; + default: return "(unknown creator)"; + } +} + +} // end namespace juniper diff --git a/juniper/src/vespa/juniper/queryvisitor.h b/juniper/src/vespa/juniper/queryvisitor.h new file mode 100644 index 00000000000..28b9047dc72 --- /dev/null +++ b/juniper/src/vespa/juniper/queryvisitor.h @@ -0,0 +1,73 @@ +// Copyright 2016 Yahoo Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. 
+#pragma once + +/* $Id$ */ + +#include "rpinterface.h" +#include "querynode.h" +#include <vector> + +// Juniper internal implementation of the IQueryVisitor interface as used by +// query providers (this is the initial 1.0.x structure implementation) + +typedef juniper::QueryItem QueryItem; +typedef juniper::IQuery IQuery; +typedef juniper::QueryHandle QueryHandle; + +class Matcher; + +namespace juniper { + class QueryModifier; +} + +/** See IQueryVisitor for detailed interface description + */ +class QueryVisitor : public juniper::IQueryVisitor +{ +public: + QueryVisitor(const IQuery& query, QueryHandle* qhandle, juniper::QueryModifier & queryModifier); + virtual ~QueryVisitor(); + + virtual bool VisitAND(const QueryItem* item, int arity); + virtual bool VisitOR(const QueryItem* item, int arity); + virtual bool VisitANY(const QueryItem* item, int arity); + virtual bool VisitNEAR(const QueryItem* item, int arity, int limit); + virtual bool VisitWITHIN(const QueryItem* item, int arity, int limit); + virtual bool VisitRANK(const QueryItem* item, int arity); + virtual bool VisitPHRASE(const QueryItem* item, int arity); + virtual bool VisitANDNOT(const QueryItem* item, int arity); + virtual bool VisitTHRESHOLD(const QueryItem* item, int arity, int threshold); + virtual bool VisitOther(const QueryItem* item, int arity); + virtual void VisitKeyword(const QueryItem* item, const char* keyword, + const size_t length = 0, bool prefix = false, bool specialToken = false); + + /** Grab pointer to (and ownership of) the query structure generated by this visitor. + * The call releases the query structure from this visitor. + * @return The root node in the generated query tree now to be managed and + * subsequently deleted by caller + */ + QueryExpr* GetQuery(); + +protected: + std::string get_index(const QueryItem* item); +private: + // Helper functions/members for use during construction only. 
+ void update_parameters(const char* options); + void insert(QueryExpr* expr); + void postprocess_query(); + juniper::QueryModifier & _queryModifier; + + const IQuery* _fquery; // Temp.pointer to the input query (valid in constructor only.. + + // Members valid after init.. + QueryExpr* _query; // Root of query + QueryExpr* _current; // Current position in query tree + QueryHandle* _qhandle; + int _term_index; + bool _got_stack; // Set when we have created a stack root + + QueryVisitor(QueryVisitor &); + QueryVisitor &operator=(QueryVisitor &); +}; + + diff --git a/juniper/src/vespa/juniper/reducematcher.cpp b/juniper/src/vespa/juniper/reducematcher.cpp new file mode 100644 index 00000000000..68b2452d03c --- /dev/null +++ b/juniper/src/vespa/juniper/reducematcher.cpp @@ -0,0 +1,93 @@ +// Copyright 2016 Yahoo Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. +#include <vespa/fastos/fastos.h> +#include <vespa/log/log.h> +LOG_SETUP(".juniper.reducematcher"); +#include "juniperdebug.h" +#include "reducematcher.h" +#include "querynode.h" + +namespace juniper +{ + +ReduceMatcher::ReduceMatcher() + : _matchers() +{ } + +void string_matcher::add_term(QueryTerm* t) +{ + string_match_table::iterator it = _table.find(t->term()); + if (it == _table.end()) + { + std::pair<string_match_table::iterator,bool> p = + _table.insert(std::make_pair(t->term(), std::vector<QueryTerm*>())); + it = p.first; + } + it->second.push_back(t); +} + + +ReduceMatcher::~ReduceMatcher() +{ + _matchers.delete_second(); +} + + +string_matcher* ReduceMatcher::find(Rewriter* rw) +{ + string_matcher* sm = _matchers.find(rw); + if (!sm) + sm = _matchers.insert(rw, new string_matcher(rw)); + return sm; +} + + +const std::vector<QueryTerm*>* ReduceMatcher::match(uint32_t langid, + const char* term, size_t len) +{ + std::vector<QueryTerm*>* vp = NULL; + // Try each of the matchers + for (std::map<Rewriter*,string_matcher*>::iterator mit = _matchers.begin(); + mit != 
_matchers.end(); ++mit) + { + string_matcher* m = mit->second; + // Expand term to all its forms: + RewriteHandle* rh = m->rewriter()->Rewrite(langid, term, len); + + size_t elen; + const char* eterm = m->rewriter()->NextTerm(rh, elen); + while (eterm) + { + std::string t(eterm, elen); + string_match_table::iterator sit = m->lookup(t); + if (LOG_WOULD_LOG(spam)) { + std::string s(m->dump()); + LOG(spam, "(reduction) matching '%s' with %s", + t.c_str(), s.c_str()); + } + if (sit != m->table().end()) + { + if (!vp) { vp = new std::vector<QueryTerm*>(sit->second); + } else { + vp->insert(vp->end(),sit->second.begin(), sit->second.end()); } + } + eterm = m->rewriter()->NextTerm(rh, elen); + } + } // for (each matcher) + LOG(spam, "reduction yielded %ld query term hits", (vp ? vp->size() : 0l)); + return vp; +} // match() + + +std::string string_matcher::dump() +{ + std::string s("["); + for (string_match_table::iterator it = _table.begin(); + it != _table.end(); ++it) + { + s = s + it->first.c_str() + " "; + } + s = s + "]"; + return s; +} + +} // end namespace juniper diff --git a/juniper/src/vespa/juniper/reducematcher.h b/juniper/src/vespa/juniper/reducematcher.h new file mode 100644 index 00000000000..cc50f6ec518 --- /dev/null +++ b/juniper/src/vespa/juniper/reducematcher.h @@ -0,0 +1,48 @@ +// Copyright 2016 Yahoo Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. 
+#pragma once + +#include "rewriter.h" +#include "querymodifier.h" +#include "simplemap.h" + +namespace juniper +{ + +typedef std::map<std::string, std::vector<QueryTerm*> > string_match_table; + +class string_matcher +{ +public: + explicit string_matcher(Rewriter* rw) : _rewriter(rw), _table() {} + + void add_term(QueryTerm* t); + + inline string_match_table::iterator lookup(std::string& key) + { return _table.find(key); } + inline Rewriter* rewriter() const { return _rewriter; } + inline string_match_table& table() { return _table; } + + inline bool operator==(const string_matcher& m) { return _rewriter == m.rewriter(); } + std::string dump(); // for debugging +private: + Rewriter* _rewriter; + string_match_table _table; + + string_matcher(string_matcher &); + string_matcher &operator=(string_matcher &); +}; + +class ReduceMatcher +{ +public: + ReduceMatcher(); + ~ReduceMatcher(); + string_matcher* find(Rewriter* rw); + const std::vector<QueryTerm*>* match(uint32_t langid, const char* term, size_t len); +private: + simplemap<Rewriter*, string_matcher*> _matchers; +}; + +} // end namespace juniper + + diff --git a/juniper/src/vespa/juniper/result.cpp b/juniper/src/vespa/juniper/result.cpp new file mode 100644 index 00000000000..66a1a5a8656 --- /dev/null +++ b/juniper/src/vespa/juniper/result.cpp @@ -0,0 +1,212 @@ +// Copyright 2016 Yahoo Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. +#include <vespa/fastos/fastos.h> +#include <vespa/log/log.h> +LOG_SETUP(".juniper.result"); +#define _NEED_SUMMARY_CONFIG_IMPL 1 +#include "SummaryConfig.h" +#include "rpinterface.h" +#include "result.h" +#include "juniperparams.h" +#include "Matcher.h" +#include "tokenizer.h" +#include "config.h" +#include "appender.h" + +namespace juniper +{ + +/** Actual implementation of Juniper generated summaries. 
*/ +class SummaryImpl : public Summary +{ +public: + explicit SummaryImpl() : _text("") {} + explicit SummaryImpl(const std::string& t) : _text(t) {} + virtual ~SummaryImpl() {} + virtual const char* Text() const { return _text.c_str(); } + virtual size_t Length() const { return _text.size(); } + std::string _text; +}; + + +Result::Result(Config* config, QueryHandle* qhandle, + const char* docsum, size_t docsum_len, uint32_t langid) : + _qhandle(qhandle), + _mo(qhandle->MatchObj(langid)), + _docsum(docsum), + _docsum_len(docsum_len), + _langid(langid), + _config(config), + _matcher(), + _tokenizer(), + _summaries(), + _scan_done(false), + _dynsum_len(-1), + _max_matches(-1), + _surround_max(-1), + _stem_min(0), + _stem_extend(0), + _winsize(0), + _winsize_fallback_multiplier(10.0), + _max_match_candidates(1000) +{ + if (!_mo) return; // The empty result.. + + MatcherParams& mp = _config->_matcherparams; + Fast_WordFolder* wordfolder = mp.WordFolder(); + + if (_qhandle->_stem_min < 0) + _stem_min = mp.StemMinLength(); + else + _stem_min = _qhandle->_stem_min; + + if (_qhandle->_stem_extend < 0) + _stem_extend = mp.StemMaxExtend(); + else + _stem_extend = _qhandle->_stem_extend; + + if (_qhandle->_winsize < 0) + _winsize = mp.MatchWindowSize(); + else + _winsize = _qhandle->_winsize; + + if (_qhandle->_winsize_fallback_multiplier < 0) + _winsize_fallback_multiplier = mp.MatchWindowSizeFallbackMultiplier(); + else + _winsize_fallback_multiplier = _qhandle->_winsize_fallback_multiplier; + + if (_qhandle->_max_match_candidates < 0) { + _max_match_candidates = mp.MaxMatchCandidates(); + } else { + _max_match_candidates = _qhandle->_max_match_candidates; + } + + /* Create the new pipeline */ + _tokenizer.reset(new JuniperTokenizer(wordfolder, NULL, 0, NULL)); + + _matcher.reset(new Matcher(this)); + _matcher->SetProximityFactor(mp.ProximityFactor()); + + _registry.reset(new SpecialTokenRegistry(_matcher->getQuery())); + + if (qhandle->_log_mask) + 
_matcher->set_log(qhandle->_log_mask); + + _tokenizer->SetSuccessor(_matcher.get()); + if (!_registry->getSpecialTokens().empty()) { + _tokenizer->setRegistry(_registry.get()); + } +} + +Result::~Result() +{ + delete_all(_summaries); +} + + +long Result::GetRelevancy() +{ + if (!_mo) return PROXIMITYBOOST_NOCONSTRAINT_OFFSET; + if (!_mo->Query()) return PROXIMITYBOOST_NOCONSTRAINT_OFFSET; + Scan(); + long retval = _matcher->GlobalRank(); + LOG(debug, "juniper::GetRelevancy(%lu)", retval); + return retval; +} + +Summary* Result::GetTeaser(const Config* alt_config) +{ + LOG(debug, "juniper::GetTeaser"); + const Config* cfg = (alt_config ? alt_config : _config); + const DocsumParams& dsp = cfg->_docsumparams; + if (_qhandle->_dynsum_len < 0) + _dynsum_len = dsp.Length(); + else + _dynsum_len = _qhandle->_dynsum_len; + SummaryImpl *sum = NULL; + // Avoid overhead when being called with an empty stack + if (_mo && _mo->Query()) { + Scan(); + if (_qhandle->_max_matches < 0) + _max_matches = dsp.MaxMatches(); + else + _max_matches = _qhandle->_max_matches; + if (_qhandle->_surround_max < 0) + _surround_max = dsp.SurroundMax(); + else + _surround_max = _qhandle->_surround_max; + + SummaryDesc* sdesc = + _matcher->CreateSummaryDesc(_dynsum_len, dsp.MinLength(), _max_matches, _surround_max); + + if (sdesc) { + size_t char_size; + sum = new SummaryImpl(BuildSummary(_docsum, _docsum_len, sdesc, cfg->_sumconf, char_size)); + DeleteSummaryDesc(sdesc); + } + } + + if (sum == NULL) { + sum = new SummaryImpl(); + } + + if (sum->_text.empty() && dsp.Fallback() == DocsumParams::FALLBACK_PREFIX) + { + std::vector<char> text; + Appender a(cfg->_sumconf); + ucs4_t buf[TOKEN_DSTLEN]; + const char *src = _docsum; + const char *src_end = _docsum + _docsum_len; + ucs4_t *dst = buf; + ucs4_t *dst_end = dst + TOKEN_DSTLEN; + Fast_WordFolder *folder = _config->_matcherparams.WordFolder(); + + text.reserve(_dynsum_len*2); + if (src_end - src <= _dynsum_len) { + a.append(text, src, src_end - 
src); + src = src_end; // ensure while loop not run + } + while (src < src_end) + { + const char *startpos; + size_t tokenLen; + const char *old_src = src; + size_t old_sum_len = text.size(); + src = folder->UCS4Tokenize(src, src_end, dst, dst_end, + startpos, tokenLen); + if (dst[0] == 0) { + a.append(text, old_src, src_end - old_src); + src = src_end; // ensure loop termination + } else { + a.append(text, old_src, src - old_src); + } + if (text.size() > (size_t) _dynsum_len) { + text.resize(old_sum_len); + text.insert(text.end(), cfg->_sumconf->dots().begin(), cfg->_sumconf->dots().end()); + break; + } + } + sum->_text = std::string(&text[0], text.size()); + } + _summaries.push_back(sum); + return sum; +} + + +Summary* Result::GetLog() +{ + // Avoid overhead when being called with an empty stack + Summary* sum = NULL; + if (_mo && _mo->Query()) + { + LOG(debug, "juniper::GetLog"); + Scan(); + sum = new SummaryImpl(_matcher->GetLog()); + } + else + sum = new SummaryImpl(); + _summaries.push_back(sum); + return sum; +} + + +} // end namespace juniper diff --git a/juniper/src/vespa/juniper/result.h b/juniper/src/vespa/juniper/result.h new file mode 100644 index 00000000000..07e966acae3 --- /dev/null +++ b/juniper/src/vespa/juniper/result.h @@ -0,0 +1,70 @@ +// Copyright 2016 Yahoo Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. 
+#pragma once + +#include "queryhandle.h" +#include "tokenizer.h" +#include "juniperdebug.h" +#include <memory> + +namespace juniper +{ + +#define PROXIMITYBOOST_NOCONSTRAINT_OFFSET 2 + +class Result +{ +public: + Result(Config* config, QueryHandle* qhandle, + const char* docsum, size_t docsum_len, uint32_t langid); + ~Result(); + + inline void Scan() + { + if (!_scan_done) + { + _tokenizer->SetText(_docsum, _docsum_len); + _tokenizer->scan(); + _scan_done = true; + } + } + + long GetRelevancy(); + size_t StemMin() const { return _stem_min; } + size_t StemExt() const { return _stem_extend; } + size_t WinSize() const { return _winsize; } + double WinSizeFallbackMultiplier() const { return _winsize_fallback_multiplier; } + size_t MaxMatchCandidates() const { return _max_match_candidates; } + Summary* GetTeaser(const Config* alt_config); + Summary* GetLog(); + + QueryHandle* _qhandle; + MatchObject* _mo; + const char* _docsum; + size_t _docsum_len; + uint32_t _langid; + Config* _config; + std::unique_ptr<Matcher> _matcher; + std::unique_ptr<SpecialTokenRegistry> _registry; + std::unique_ptr<JuniperTokenizer> _tokenizer; +private: + std::vector<Summary*> _summaries; // Active summaries for this result + bool _scan_done; // State of the result - is text scan done? + + /* Option storage */ + int _dynsum_len; // Dynamic summary length + int _max_matches; // Number of matches in summary + int _surround_max; // Max surrounding characters of a keyword in a teaser + size_t _stem_min; // Min.size of word to apply "fuzzy" matching + // The max number of characters to allow a word to contain in addition to the + // search keyword prefix for it to match (set this to 0 to disable stemming!) 
+ // default 3 + size_t _stem_extend; + size_t _winsize; // Window size to use when matching + double _winsize_fallback_multiplier; + size_t _max_match_candidates; + + Result(Result &); + Result &operator=(Result &); +}; + +} // end namespace juniper diff --git a/juniper/src/vespa/juniper/rewriter.h b/juniper/src/vespa/juniper/rewriter.h new file mode 100644 index 00000000000..ce46c83214a --- /dev/null +++ b/juniper/src/vespa/juniper/rewriter.h @@ -0,0 +1,59 @@ +// Copyright 2016 Yahoo Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. +/* $Id$ */ + +#pragma once + +#include <vespa/fastos/fastos.h> +#include "query.h" + +/** @file rewriter.h + * This file describes describes Juniper's expected interface for + * query word rewrite (typically language dependent) + * Rewrite of certain query words into a list of words + * are enabled by issuing an AddRewriter call. + * + * The AddRewriter call is implemented by Juniper. + * The IRewriter interface must be implemented by the caller module + * to serve calls from juniper when the particular ItemCreator is found. + * Subsequent AddRewriter calls using the same ItemCreator will override + * the previous setting (rewrite and/or reduction for that creator. + * Multiple AddRewriter calls with different creator values are accepted. + */ + +namespace juniper +{ + +// Opaque handle only used by implementer: +class RewriteHandle; + + +class IRewriter +{ +public: + virtual ~IRewriter() {} + + /** return the name of this particular rewriter (for debugging purposes) */ + virtual const char* Name() const = 0; + + /** Map the given term to its rewritten form(s) wrt. the given language + * represented with language identifiers compatible with the + * ones used in the Analyse calls (rpinterface.h) + * @return a handle that can be used to retrieve words + * representing the rewritten forms. A NULL return value means + * that no rewrites exist and that the original form should be used. 
+ */ + virtual RewriteHandle* Rewrite(uint32_t langid, const char* term) = 0; + virtual RewriteHandle* Rewrite(uint32_t langid, const char* term, size_t length) = 0; + + /** Retrieve the next term from the RewriteHandle object + * To be used repeatedly by Juniper until NULL is returned to + * signal that there are no more rewrites. + * At this point the RewriteHandle object and all the returned terms + * may become invalid. Juniper will either retrieve all terms returned + * by a Map call OR call the ReleaseHandle call: + */ + virtual const char* NextTerm(RewriteHandle* exp, size_t& length) = 0; +}; + +} // end namespace juniper + diff --git a/juniper/src/vespa/juniper/rpinterface.cpp b/juniper/src/vespa/juniper/rpinterface.cpp new file mode 100644 index 00000000000..285eb9bacd6 --- /dev/null +++ b/juniper/src/vespa/juniper/rpinterface.cpp @@ -0,0 +1,151 @@ +// Copyright 2016 Yahoo Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. +/* $Id$ */ + +#include <vespa/fastos/fastos.h> +#include <vespa/log/log.h> +LOG_SETUP(".juniper.rpinterface"); +#include "rpinterface.h" +#include <string> +#include <vector> +#include "juniperparams.h" +#include "foreach_utils.h" +#include "juniperdebug.h" +#include "SummaryConfig.h" +#include "queryvisitor.h" +#include "querynode.h" +#include "queryhandle.h" +#include "propreader.h" +#include "result.h" +#include "config.h" +#include "querymodifier.h" +#include <vespa/fastlib/text/normwordfolder.h> + +/* Implementation of the interface between Juniper and the content/query provider */ + +namespace juniper +{ + +bool AnalyseCompatible(Config* conf1, Config* conf2) +{ + return conf1 == conf2 || + (conf1 && conf2 && conf1->_matcherparams == conf2->_matcherparams + && conf1->_docsumparams.Length() == conf2->_docsumparams.Length()); +} + +void SetDebug(unsigned int mask) +{ +#ifdef FASTOS_DEBUG + if (mask & ~1 && mask != debug_level) + LOG(info, "Juniper debug mode enabled (0x%x)", mask); + else if (! 
(debug_level & ~1)) + LOG(info, "Juniper debug mode disabled (0x%x)", mask); + debug_level = mask; +#else + // Make sure we do not get 200 of these warnings per query.. + static bool warning_printed = false; + if (mask && !warning_printed) + { + LOG(warning, + "Juniper debug mode requested in binary compiled without debug support!"); + warning_printed = true; + } +#endif +} + + +Juniper::Juniper(IJuniperProperties* props, Fast_WordFolder* wordfolder, int api_version) : + _props(props), + _wordfolder(wordfolder), + _modifier(new QueryModifier()) +{ + if (api_version != JUNIPER_RP_ABI_VERSION) + { + // This can happen if fsearch and juniper is not compiled with the same version of the + // Juniper API header files. + LOG(error, "FATAL: " + "juniper::Init: incompatible ABI versions between Juniper(%d) and caller (%d)!", + JUNIPER_RP_ABI_VERSION, api_version); + } + + assert(props); + assert(wordfolder); + + LOG(debug, "Juniper result processor (interface v.%d)", JUNIPER_RP_ABI_VERSION); + + unsigned int debug_mask = strtol(_props->GetProperty("juniper.debug_mask", "0"), NULL, 0); + if (debug_mask) SetDebug(debug_mask); + +} + +Juniper::~Juniper() +{ +} + +std::unique_ptr<Config> Juniper::CreateConfig(const char* config_name) +{ + return std::unique_ptr<Config>(new Config(config_name, *this)); +} + +QueryHandle* Juniper::CreateQueryHandle(const IQuery& fquery, const char* juniperoptions) +{ + return new QueryHandle(fquery, juniperoptions, *_modifier); +} + +void Juniper::AddRewriter(const char* index_name, IRewriter* rewriter, bool for_query, bool for_document) +{ + _modifier->AddRewriter(index_name, rewriter, for_query, for_document); +} + +void Juniper::FlushRewriters() +{ + _modifier->FlushRewriters(); +} + +void ReleaseConfig(Config*& config) +{ + delete config; + config = NULL; +} + + +void ReleaseQueryHandle(QueryHandle*& handle) +{ + delete handle; + handle = NULL; +} + + +Result* Analyse(const Config* config, QueryHandle* qhandle, + const char* docsum, size_t 
docsum_len, + uint32_t docid, uint32_t /* inputfield_id */, + uint32_t langid) +{ + LOG(debug, "juniper::Analyse(): docId(%u), docsumLen(%zu), docsum(%s), langId(%u)", + docid, docsum_len, docsum, langid); + Result* res = new Result(const_cast<Config*>(config), qhandle, docsum, docsum_len, langid); + return res; +} + +long GetRelevancy(Result* result_handle) +{ + return result_handle->GetRelevancy(); +} + +Summary* GetTeaser(Result* result_handle, const Config* alt_config) +{ + return result_handle->GetTeaser(alt_config); +} + +Summary* GetLog(Result* result_handle) +{ + return result_handle->GetLog(); +} + +void ReleaseResult(Result*& result_handle) +{ + LOG(debug, "juniper::ReleaseResult"); + delete result_handle; + result_handle = NULL; +} + +} // end namespace juniper diff --git a/juniper/src/vespa/juniper/rpinterface.h b/juniper/src/vespa/juniper/rpinterface.h new file mode 100644 index 00000000000..3c04df001ab --- /dev/null +++ b/juniper/src/vespa/juniper/rpinterface.h @@ -0,0 +1,214 @@ +// Copyright 2016 Yahoo Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. +/* $Id$ */ + +#pragma once + +#include "IJuniperProperties.h" +#include "rewriter.h" +#include <memory> + +/** @file rpinterface.h This file is the main include file for the advanced + * result processing interface to Juniper. The complete set of new interfaces + * to Juniper as of Juniper v.2.x.x is contained in the juniper namespace. + * This file together with query.h is the result processing part of these interfaces. + * The other part is the indexing/document processing interface with main include file + * dpinterface.h + */ + +/** This define will be changed only in case of backward incompatible + * API changes - we use 2 initially to avoid confusion with Juniper 1.0.x.. + */ +#define JUNIPER_RP_ABI_VERSION 3 + +/* Changes to this version number indicates minor interface additions + * where the original interface is kept unchanged. 
Can be used to test for features. + */ +#define JUNIPER_RP_API_MINOR_VERSION 1 + +class Fast_WordFolder; + +/** This is the new query/document provider interface to Juniper as of Juniper 2.0.x + * It replaces the Juniper 1.0.x interface on the result processing side, previously + * defined by simpledynsum.h . + * While the old interface (simpledynsum.h) is kept for backward compatibility, it is + * depreciated as it allows less flexibility and thus provides lower quality teasers. + */ +namespace juniper +{ + +/** Version tag generated from Makefile/configure system */ +extern const char* version_tag; + +/** Opaque object encapsulating a default configuration set for Juniper. + * Multiple such configurations can co-exist, for instance to allow different + * summary fields to use different teaser configurations. + * Note that in addition to this (relatively static) configuration set, + * configuration parameters can be overridden on a per query basis by means of the + * juniperoptions query parameter. See the Juniper 2.x.x documentation for details. + */ +class Config; + +/** Opaque object encapsulating state associated with a particular query + */ +class QueryHandle; + +/** Opaque object encapsulating the result of a partial or full Juniper + * analysis of a document. + */ +class Result; + +class QueryModifier; + +class Summary +{ +public: + virtual ~Summary() {} + // The textual representation of the generated summary + virtual const char* Text() const = 0; + virtual size_t Length() const = 0; +}; + +class Juniper { +public: + /** + * Convenience typedefs. + */ + typedef std::unique_ptr<Juniper> UP; + typedef std::shared_ptr<Juniper> SP; + + /** Initialize the Juniper subsystem. + * @param props A pointer to the object containing all available configuration + * property values for the Juniper parameters. + * @param wordfolder A pointer to a custom wordfolder object to use. If + * NULL, a default wordfolder will be maintained by Juniper if necessary. 
+ * In case of errors during initialization or config object creation, + * the cause will be appropriately reported to the @param log object + * with status ELOG_CRITICAL + * @param api_version Version check parameter + * - should always be left with the default value to ensure binary backward + * compatibility between versions. + */ + Juniper(IJuniperProperties* props, + Fast_WordFolder* wordfolder, int api_version = JUNIPER_RP_ABI_VERSION); + /** Deinitialize the Juniper subsystem. Release all remaining resources + * associated with Juniper - reverse the effect of the Init function. + * Assumes that all Result objects have been released. + */ + ~Juniper(); + + Fast_WordFolder & getWordFolder() { return *_wordfolder; } + IJuniperProperties & getProp() { return *_props; } + QueryModifier & getModifier() { return *_modifier; } + + /** Create a result processing configuration of Juniper for subsequent use + * @param config_name a symbolic prefix to be used in the fsearch configuration file + * (fsearchrc/fsearch.addon*). The default value reflects the Juniper 1.x.x usage where + * Juniper configuration variables are supplied as "juniper.dynsum.length value" pairs. + * If a configuration object gets a config name of "mysummaryfield", then + * if "mysummaryfield.dynsum.length exists as a property in the config file, + * then that value is used, otherwise the default "juniper.dynsum.length" value is used. + * @return a nonzero object for subsequent reference if initialization is done, + * NULL if an error occurred. + */ + + std::unique_ptr<Config> CreateConfig(const char* config_name = "juniper"); + /** Allocate a query handle for the given query for subsequent calls to Analyse + * for different hits. Performs the necessary per query processing for Juniper. + * @param query A query to start result processing for. + * @param juniperoptions The value of the special juniperoption URL parameter + * provided for this search. 
This parameter is parsed by Juniper to support optional + * behaviour such as user customization of teaser parameters, selectively + * enabling of Juniper debugging/tracing features and to support Juniper extensions + * to the query language. + * @return An allocated handle to be subsequently released by ReleaseQueryHandle() + */ + QueryHandle* CreateQueryHandle(const IQuery& query, const char* juniperoptions); + + /** Add a rewriter for all terms that are prefixed with the given index. + * When Juniper encounters a term in the query tagged with this index, + * Juniper assumes that the term has been subject to expansion, and will + * apply the rewriter to all terms in all analysed documents before + * matching with the query. + */ + void AddRewriter(const char* index_name, IRewriter* rewriter, bool for_query, bool for_document); + + // Mostly for testing - being able to start with clean sheets for each test: + void FlushRewriters(); + +private: + IJuniperProperties * _props; + Fast_WordFolder * _wordfolder; + std::unique_ptr<QueryModifier> _modifier; +}; + +/** This function defines an equality relation over Juniper configs. + * @return true if a previously acquired result handle (through use of + * one of the Config objects) can be reused (typically to produce a + * differently looking teaser) with the other Config object. + * This is the case if the two config objects only differ in the teaser + * parameters (e.g. those named *.dynsum.*) + */ +bool AnalyseCompatible(Config* conf1, Config* conf2); + +/** Release a QueryHandle as previously allocated by CreateQueryHandle. + * @param handle The QueryHandle object to release + */ +void ReleaseQueryHandle(QueryHandle*& handle); + +/** Perform initial content analysis on a query/content pair.
+ * Note that the content may either be a simple UTF-8 encoded string or a + * more advanced representation including document structure elements, as provided + * by the Juniper document processing interface (see dpinterface.h) + * @param config A valid pointer to the parameter configuration to use for the analysis + * @param query The query, represented by a QueryHandle to base the analysis on. + * (previously generated by CreateQueryHandle) + * @param docsum A reference to a document summary to be analysed. + * @param docsum_len The length in bytes of the document summary, including + any meta information. + * @param docid A 32 bit number uniquely identifying the document to be analysed + * @param inputfield_id A 32 bit number uniquely identifying the summary field + within the document that contains the provided document summary. + * @param langid A unique 32 bit id representing the language which + this document summary is to be analysed in context of. + * @return A pointer to an allocated handle to be used in subsequent specific result + * requests (must later be released with ReleaseResult()) + */ +Result* Analyse(const Config* config, QueryHandle* query, + const char* docsum, size_t docsum_len, + uint32_t docid, uint32_t inputfield_id, + uint32_t langid); + +/** Get the computed relevancy of the processed content from the result. + * @param result_handle The result to retrieve from + * @return The relevancy (proximitymetric) of the processed content. + */ +long GetRelevancy(Result* result_handle); + +/** Generate a teaser based on the provided analysis result + * @param result_handle a handle obtained by a previous call to Analyse + * @param alt_config An optional alternate config to use for this teaser generation + * The purpose of alt_config is to allow generation of multiple teasers + * based on the same content and analysis. + * @return The generated Teaser object. 
This object is valid until ReleaseResult + * is called for result_handle + */ +Summary* GetTeaser(Result* result_handle, const Config* alt_config = NULL); + +/** Retrieve log information based on the previous calls to this result handle. + * Note that for the log to be complete, the juniper log override entry in + * the summary field map must be placed after any other juniper override directives. + * @param result_handle a handle obtained by a previous call to Analyse. + * @return value: a summary description containing the Juniper log as a text field + * if any log information is available, or else an empty summary. + * This object is valid until ReleaseResult is called for result_handle + */ +Summary* GetLog(Result* result_handle); + +/** Release all resources associated with the handle given including the + * summaries created by this result handle. + * @param result_handle The handle to release + */ +void ReleaseResult(Result*& result_handle); + +} // end namespace juniper + diff --git a/juniper/src/vespa/juniper/simplemap.h b/juniper/src/vespa/juniper/simplemap.h new file mode 100644 index 00000000000..dcaf3ac67dc --- /dev/null +++ b/juniper/src/vespa/juniper/simplemap.h @@ -0,0 +1,61 @@ +// Copyright 2016 Yahoo Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. 
#pragma once

#include <cstddef>
#include <map>
#include <utility>

/**
 * Minimal wrapper around std::map.
 *
 * Lookups and failed inserts report "nothing there" by returning a
 * value-initialised _val (a null pointer when _val is a pointer type)
 * instead of an iterator. delete_second() additionally requires _val to
 * be a pointer to a heap-allocated object.
 */
template<typename _key, typename _val>
class simplemap
{
public:
    explicit simplemap() : _map() {}

    /**
     * Copy the entries of another map.
     * Takes a const reference so that const maps and temporaries can be
     * copied as well; still explicit, so no implicit copies are introduced.
     * Only the stored values are copied - pointer values end up shared
     * between both maps.
     */
    explicit simplemap(const simplemap& m) : _map(m._map) {}

    virtual ~simplemap() {}

    /**
     * Insert val under key unless the key is already present.
     * @return the stored value if the entry was inserted, or a
     *         value-initialised _val (null for pointers) if the key
     *         already existed - the existing entry is left untouched.
     */
    _val insert(_key key, _val val)
    {
        std::pair<typename std::map<_key,_val>::iterator, bool> p =
            _map.insert(std::make_pair(key, val));
        return p.second ? p.first->second : _val();
    }

    /**
     * Look up key.
     * @return the stored value, or a value-initialised _val (null for
     *         pointers) if the key is not present.
     */
    _val find(_key key) const
    {
        typename std::map<_key,_val>::const_iterator it = _map.find(key);
        return (it != _map.end()) ? it->second : _val();
    }

    /** Number of entries currently stored. */
    size_t size() const { return _map.size(); }

    typename std::map<_key,_val>::iterator begin() { return _map.begin(); }
    typename std::map<_key,_val>::iterator end() { return _map.end(); }

    /**
     * delete every stored value and reset it to null.
     * The keys stay in the map; call clear() afterwards to drop them.
     */
    void delete_second()
    {
        for (typename std::map<_key,_val>::iterator it = _map.begin();
             it != _map.end(); ++it)
        {
            delete it->second;
            it->second = nullptr;
        }
    }

    /** Remove all entries (stored values are not deleted). */
    void clear()
    {
        _map.clear();
    }
protected:
    std::map<_key,_val>& map() { return _map; }
private:
    std::map<_key,_val> _map;
};


// diff --git a/juniper/src/vespa/juniper/specialtokenregistry.cpp b/juniper/src/vespa/juniper/specialtokenregistry.cpp new file mode 100644 index 00000000000..a042fae94be --- /dev/null +++ b/juniper/src/vespa/juniper/specialtokenregistry.cpp @@ -0,0 +1,114 @@
// Copyright 2016 Yahoo Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+ +#include <vespa/fastos/fastos.h> +#include <vespa/log/log.h> +LOG_SETUP(".juniper.specialtokenregistry"); +#include "specialtokenregistry.h" + +namespace { + +class QueryVisitor : public IQueryExprVisitor { +private: + juniper::SpecialTokenRegistry & _registry; + +public: + QueryVisitor(juniper::SpecialTokenRegistry & registry) : _registry(registry) {} + virtual void VisitQueryNode(QueryNode *) { } + virtual void RevisitQueryNode(QueryNode *) { } + virtual void VisitQueryTerm(QueryTerm * t) { + if (t->isSpecialToken()) { + _registry.addSpecialToken(t); + } + } +}; + +} + +namespace juniper { + + +SpecialTokenRegistry::CharStream::CharStream(const char * srcBuf, const char * srcEnd, + ucs4_t * dstBuf, ucs4_t * dstEnd) : + + _srcBuf(srcBuf), + _srcItr(srcBuf), + _srcEnd(srcEnd), + _nextStart(srcBuf), + _dstBuf(dstBuf), + _dstItr(dstBuf), + _dstEnd(dstEnd), + _isStartWordChar(false) +{ + if (srcBuf < srcEnd) { + ucs4_t ch = getNextChar(); + _nextStart = _srcItr; + _isStartWordChar = Fast_UnicodeUtil::IsWordChar(ch); + reset(); + } +} + +bool +SpecialTokenRegistry::CharStream::resetAndInc() +{ + _srcItr = _nextStart; + if (hasMoreChars()) { + ucs4_t ch = getNextChar(); + _isStartWordChar = Fast_UnicodeUtil::IsWordChar(ch); + _srcBuf = _nextStart; // move start to next character + _nextStart = _srcItr; // move next start to the next next character + reset(); + return true; + } else { + return false; + } +} + + +bool +SpecialTokenRegistry::match(const ucs4_t * qsrc, const ucs4_t * qend, CharStream & stream) const +{ + for (; (qsrc < qend) && stream.hasMoreChars(); ++qsrc) { + ucs4_t ch = stream.getNextChar(); + if (ch != *qsrc) { + return false; + } + } + return (qsrc == qend); +} + +SpecialTokenRegistry::SpecialTokenRegistry(QueryExpr * query) : + _specialTokens() +{ + QueryVisitor qv(*this); + query->Accept(qv); // find the special tokens +} + +const char * +SpecialTokenRegistry::tokenize(const char * buf, const char * bufend, + ucs4_t * dstbuf, ucs4_t * dstbufend, 
+ const char * & origstart, size_t & tokenlen) const +{ + CharStream stream(buf, bufend, dstbuf, dstbufend); + bool foundWordChar = false; + while(!foundWordChar && stream.hasMoreChars() && stream.hasMoreSpace()) { + for (size_t i = 0; i < _specialTokens.size(); ++i) { + const ucs4_t * qsrc = _specialTokens[i]->ucs4_term(); + const ucs4_t * qend = qsrc + _specialTokens[i]->ucs4_len; + // try to match the given special token with the input stream + if (match(qsrc, qend, stream)) { + origstart = stream.getSrcStart(); + tokenlen = stream.getNumChars(); + return stream.getSrcItr(); + } + stream.reset(); + } + foundWordChar = stream.isStartWordChar(); + stream.resetAndInc(); + } + + return NULL; +} + +} // namespace juniper + + diff --git a/juniper/src/vespa/juniper/specialtokenregistry.h b/juniper/src/vespa/juniper/specialtokenregistry.h new file mode 100644 index 00000000000..683a715f52d --- /dev/null +++ b/juniper/src/vespa/juniper/specialtokenregistry.h @@ -0,0 +1,80 @@ +// Copyright 2016 Yahoo Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. + +#pragma once + +#include <vector> +#include "querynode.h" +#include <vespa/vespalib/text/lowercase.h> + +namespace juniper { + +/** + * This registry is responsible for knowing the set of query terms that are marked as special tokens. + * The class operates on a character stream and tries to tokenize this into special tokens. + */ +class SpecialTokenRegistry +{ +public: + /** + * Helper class for handling a character stream. 
+ */ + class CharStream { + private: + const char * _srcBuf; // the current start of the source buffer + const char * _srcItr; // the source iterator + const char * _srcEnd; // the end of the source buffer + const char * _nextStart; // the next start character + ucs4_t * _dstBuf; // the start of the destination buffer + ucs4_t * _dstItr; // the destination iterator + ucs4_t * _dstEnd; // the end of the destination buffer + bool _isStartWordChar; + + public: + CharStream(const char * srcBuf, const char * srcEnd, + ucs4_t * dstBuf, ucs4_t * dstEnd); + bool hasMoreChars() const { return _srcItr < _srcEnd; } + bool hasMoreSpace() const { return _dstItr < _dstEnd; } + ucs4_t getNextChar() { + ucs4_t ch = Fast_UnicodeUtil::GetUTF8Char(_srcItr); + ch = vespalib::LowerCase::convert(ch); + *_dstItr++ = ch; + return ch; + } + void reset() { _srcItr = _srcBuf; _dstItr = _dstBuf; } + bool resetAndInc(); + bool isStartWordChar() const { return _isStartWordChar; } + size_t getNumChars() const { return _dstItr - _dstBuf; } + const char * getSrcStart() const { return _srcBuf; } + const char * getSrcItr() const { return _srcItr; } + }; + +private: + std::vector<QueryTerm *> _specialTokens; + + bool match(const ucs4_t * qsrc, const ucs4_t * qend, CharStream & stream) const; + +public: + SpecialTokenRegistry(QueryExpr * query); + const std::vector<QueryTerm *> & getSpecialTokens() const { return _specialTokens; } + void addSpecialToken(QueryTerm * term) { + _specialTokens.push_back(term); + } + /** + * Tries to tokenize the given utf-8 buffer (character stream) into a special token. + * Returns the new position of the buffer if a special token is matched, NULL otherwise. + * + * @param buf start position of the utf-8 buffer. + * @param bufend end position of the utf-8 buffer. + * @param dstbuf start position of the destination ucs4 buffer where the characters are copied into. + * @param dstend end position of the destination ucs4 buffer. 
+ * @param origstart buffer start position of the token returned. + * @param tokenlen number of ucs4 characters in the returned token. + * @return new buffer position (after token) or NULL. + */ + const char * tokenize(const char * buf, const char * bufend, + ucs4_t * dstbuf, ucs4_t * dstbufend, + const char * & origstart, size_t & tokenlen) const; +}; + +} // namespace juniper + diff --git a/juniper/src/vespa/juniper/stringmap.cpp b/juniper/src/vespa/juniper/stringmap.cpp new file mode 100644 index 00000000000..52a250f6b1a --- /dev/null +++ b/juniper/src/vespa/juniper/stringmap.cpp @@ -0,0 +1,18 @@ +// Copyright 2016 Yahoo Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. +#include <vespa/fastos/fastos.h> +#include <vespa/log/log.h> +LOG_SETUP(".juniper.stringmap"); +#include "stringmap.h" + +void Fast_StringMap::Insert(const char* key, const char* value) +{ + _backing[key] = value; +} + + +const char * +Fast_StringMap::Lookup(const char *key, const char *defval) +{ + Map::const_iterator found(_backing.find(key)); + return (found != _backing.end()) ? found->second.c_str() : defval; +} diff --git a/juniper/src/vespa/juniper/stringmap.h b/juniper/src/vespa/juniper/stringmap.h new file mode 100644 index 00000000000..b818513f1e0 --- /dev/null +++ b/juniper/src/vespa/juniper/stringmap.h @@ -0,0 +1,19 @@ +// Copyright 2016 Yahoo Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. 
+#pragma once + +#include <vespa/vespalib/stllike/hash_map.h> + +/** A map from strings to strings where all storage is maintained internally, + * a la Perl associative arrays + */ + +class Fast_StringMap +{ +private: + typedef vespalib::hash_map<vespalib::string, vespalib::string> Map; + Map _backing; +public: + void Insert(const char* key, const char* value); + const char *Lookup(const char* key, const char* defval); +}; + diff --git a/juniper/src/vespa/juniper/sumdesc.cpp b/juniper/src/vespa/juniper/sumdesc.cpp new file mode 100644 index 00000000000..9721a84d1f5 --- /dev/null +++ b/juniper/src/vespa/juniper/sumdesc.cpp @@ -0,0 +1,899 @@ +// Copyright 2016 Yahoo Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. +/* $Id$ */ + +#include <vespa/fastos/fastos.h> +#include <vespa/log/log.h> +LOG_SETUP(".juniper.sumdesc"); +#include "sumdesc.h" +#include "juniperdebug.h" +#include <vespa/fastlib/text/unicodeutil.h> +#include "Matcher.h" +#include "appender.h" + +/** SummaryDesc: A class of objects describing a query highlight + * dynamic summary based on the current state of the provided + * matcher.
+ */ + +/* a few utilities: */ + +bool wordchar(const unsigned char* s) +{ + unsigned char c = *s; + if (c & 0x80) { + ucs4_t u = Fast_UnicodeUtil::GetUTF8Char(s); + return Fast_UnicodeUtil::IsWordChar(u); + } else { + return isalnum(c); + } +} + +bool nonwordchar(const unsigned char* s) +{ + unsigned char c = *s; + if (c & 0x80) { + ucs4_t u = Fast_UnicodeUtil::GetUTF8Char(s); + return !Fast_UnicodeUtil::IsWordChar(u); + } else { + return !isalnum(c); + } +} + + +/* Move backwards/forwards from ptr (no longer than to start) in an + * UTF8 text until the beginning of the word or (if space, until + * beginning of the next/last word) + * @return The number of bytes moved + */ +int complete_word(unsigned char* start, ssize_t length, + const unsigned char*& ptr, off_t increment) +{ + bool (*chartest)(const unsigned char*); + int moved = 0; + bool whitespace_elim = false; + const unsigned char* orig_ptr = ptr; + + LOG(spam, "complete_word start 0x%p, length %ld, ptr 0x%p, increment %ld", + start, length, ptr, increment); + // Make sure we are at the start of a character before doing any + // comparisons + int start_off = Fast_UnicodeUtil::UTF8move(start, length, ptr, 0); + if (start_off) { + LOG(spam, "Offset %d to start of char", start_off); + } + + // Figure out if a word needs completion or if we are just going + // to eliminate whitespace + if (!wordchar(ptr)) { + whitespace_elim = true; + // Change direction of scan + increment = -increment; + chartest = wordchar; + } else { + // Found a wordchar at pointer + // If moving forwards, we need to check the previous character + // for "non-wordness". 
Otherwise we might add an extra word + if (increment > 0) { + const unsigned char* pre_ptr = ptr; + int cur_move = Fast_UnicodeUtil::UTF8move(start, length, + pre_ptr, -1); + if (!wordchar(pre_ptr)) // Points at start of new word + { + whitespace_elim = true; + // Change direction of scan + increment = -increment; + chartest = wordchar; + ptr = pre_ptr; + moved += cur_move; + } else { + chartest = nonwordchar; + } + } else { + chartest = nonwordchar; + } + } + + // move until we reach a space/wordchar or the beginning/end of + // the read: + for (;;) { + LOG(spam, "[%s%d%s%c]", (whitespace_elim ? "^" : ""), + moved, (increment > 0 ? "+" : "-"), *ptr); + int cur_move = Fast_UnicodeUtil::UTF8move(start, length, + ptr, increment); + + // give up if past end of read (may still be a successful move + // ending at the first character outside of the start+length + // range: (UTF8move guarantees that pointer never gets before + // start) + if (ptr >= start + length) { + LOG(spam, "complete_word: Break at end of text"); + break; + } + + // Give up if we found a split of a word + if (cur_move <= 0) // == 0 to avoid UTF8move bug in fastlib 1.3.3.. + { + LOG(spam, "complete_word: Failing at char %c/0x%x", *ptr, *ptr); + break; + } + if (chartest(ptr)) { + LOG(spam, "complete_word: Breaking at char %c/0x%x (%d)", *ptr, + *ptr, cur_move); + // count this character (it is the first blank/wordchar) + // only if we are going forward and it is a word character + // since we are then supposed to be pointing to the first + // char not in the word while going backwards we stop at + // the start character of the word! + if (increment > 0) + moved += cur_move; + break; // Found first blank/word char.. + } + moved += cur_move; + if (moved >= MAX_SCAN_WORD) { + LOG(spam, "Word length extended max word length %d, " + "breaking at char 0x%x", MAX_SCAN_WORD, *ptr); + break; + } + } + // Adjust for getting to the start of the start character: + if (start_off) + moved += increment > 0 ? 
-start_off : start_off; + + LOG(spam, "complete_word: %s %d bytes", + (whitespace_elim ? "ws cut" + : (increment > 0 ? "appended" + : "prepended")), moved); + // Make sure pointer is correct as well: + ptr = orig_ptr + increment*moved; + moved = (whitespace_elim ? -moved : moved); + return moved; +} + + +SummaryDesc::highlight_desc::highlight_desc(off_t pos, + ssize_t len, bool highlight) + : _pos(pos), _len(len), _highlight(highlight) +{ + LOG(spam, "-- new desc: pos %ld len %ld %s", + _pos, _len, (highlight ? "(highlight)" : "")); + assert(pos >= 0); +} + + +SummaryDesc::SummaryDesc(Matcher* matcher, ssize_t length, ssize_t min_length, + int max_matches, int surround_len) + : _matcher(matcher), + _occ(matcher->OccurrenceList()), + _match_results(matcher->OrderedMatchSet()), + _length(length), + _min_length(min_length), + _remaining(length), + _surround_len(surround_len), + _est_len(0), + _hit_len(0), + _clist(), + _plist(), + _sumconf(), + _max_matches(max_matches), + _match_elems(), + _document_length(matcher->DocumentSize()), + _fulldoc() +{ + /* Check if the whole document fits within requested length and + * process this + */ + if (length + MIN_CONTINUATION*4 > (int)_document_length) { + build_fulldoc_desc(); + return; + } + + /* Adjust to sensible values */ + if (_surround_len < MIN_SURROUND_LEN) + _surround_len = MIN_SURROUND_LEN; + + /* decide what amount of matches to use (stored in _clist) */ + _match_elems = find_matches(); + + /* build highlight descriptor list */ + build_highlight_descs(); + + /* Done with matches list. Clean up */ + _clist.clear(); + + /* Spin through the resulting descriptor list and query term + * occurrence list to + * 1. identify (for highlight) accidental query term occurrences + * that are not part of the match + * 2. Split descriptor list where new keyword matches are + * found. Extend if necessary due to partially included keywords. + * 3. 
identify overlapping regions (possibly created by 2) + */ + locate_accidential_matches(); +} + + + +void SummaryDesc::locate_accidential_matches() +{ + key_occ_vector::const_iterator kit = _occ.begin(); + + for (print_list::iterator pit = _plist.begin(); + pit != _plist.end(); + ++pit) + { + highlight_desc* d = &(*pit); + + print_list::iterator nit = pit; + bool more = (++nit != _plist.end()); + + if (d->_highlight) + continue; // Ignore already found keywords.. + + /* Now investigate if there are other matches than the best + * ones that goes within the selected print context + */ + + /* Advance occurrence iterator until *kit (keyword occurrence) + * overlap d (current descriptor) or is past d + */ + while (kit != _occ.end() + && (*kit)->startpos() + (*kit)->tokenlen <= d->_pos) + ++kit; + + if (_matcher->UsesValid()) { + /* If there are subphrases or other restricting subqueries + * we must continue further on until we are past d or we + * have found an element that has the valid bit set + */ + while (kit != _occ.end() && + !(*kit)->valid() && + (*kit)->startpos() + (*kit)->tokenlen <= d->_pos + d->_len) + ++kit; + } + + if (kit == _occ.end()) + return; + + /* Turn "token cut at start" into "token contained in" case */ + if ((*kit)->startpos() < d->_pos) { + off_t offset = d->_pos - (*kit)->startpos(); + LOG(spam, "Convert start cut: offset %ld", offset); + d->_pos -= offset; + d->_len += offset; + } + + /* Split descriptors each time a new occurrence is found + * within current descriptor */ + for (; + kit != _occ.end() + && (*kit)->startpos() + (*kit)->tokenlen <= d->_pos + d->_len; + ++kit) + { + if (_matcher->UsesValid() && !(*kit)->valid()) + continue; + /* simple split - occurrence contained in (but maybe at + * start of) descriptor */ + off_t kpos = (*kit)->startpos(); + off_t klen = (*kit)->tokenlen; + off_t start_len = kpos - d->_pos; + off_t end_len = (d->_pos + d->_len) - (kpos + klen); + + LOG(spam, "Split: (%ld,%ld) (%ld, %ld) (%ld, %ld)", + d->_pos, 
start_len, + kpos, klen, + (kpos + klen), end_len); + + if (start_len > 0) + _plist.insert(pit, highlight_desc(d->_pos, start_len, false)); + + // new keyword + print_list::iterator kwit = + _plist.insert(pit, highlight_desc(kpos, klen, true)); + + if (end_len) { + LOG(spam, "-- Was: (%ld, %lu)", d->_pos, d->_len); + d->_pos = kpos + klen; + d->_len = end_len; + LOG(spam, "Modifying current to end (%ld, %lu)", + d->_pos, d->_len); + } else { + LOG(spam, "Erasing (%ld, %lu)", d->_pos, d->_len); + pit = _plist.erase(pit); + // Must ensure that d is valid (as the last descriptor seen) + // at top of loop and after end!! + d = &(*kwit); + } + } + if (kit == _occ.end()) + return; + + /* Handle cut end occurrence separately */ + off_t d_end = d->_pos + d->_len; + + if ((*kit)->startpos() < d_end + && (*kit)->startpos() + (*kit)->tokenlen > d_end) + { + off_t kpos = (*kit)->startpos(); + off_t klen = (*kit)->tokenlen; + off_t offset = (kpos + klen) - d_end; + + /* Detect if the next descriptor held part of this token */ + if (more) { + highlight_desc& nd = *nit; + if (nd._pos < kpos) { + LOG(spam, "(endsplit) Adjusting next desc %ld bytes", offset); + nd._pos += offset; + nd._len -= offset; + } + } + d->_len -= (klen - offset); + + LOG(spam, "[%ld] Endsplit: (%ld, %lu) (%ld, %ld)", + offset, d->_pos, d->_len, kpos, klen); + + /* Insert new desc after the just processed one */ + pit = _plist.insert(++pit, highlight_desc(kpos, klen, true)); + ++kit; + if (kit == _occ.end()) + return; + } + if (pit == _plist.end()) + break; + } // end for (pit..) 
+} + + +/* find a proper amount of matches */ + +int SummaryDesc::find_matches() +{ + int match_len = 0; + int match_count = 0; + int match_elems = 0; + int adjust_len = 0; + _est_len = 0; + + // Find enough proper matches (without overlap) + for (match_candidate_set::iterator it = _match_results.begin(); + it != _match_results.end(); + ++it) + { + MatchCandidate* m = (*it); + if (overlap(m)) + continue; + + ssize_t size = m->size(); + + assert(size >= 0); + m->make_keylist(); + keylist& klist = m->_klist; + assert(klist.size() > 0); + (void) klist; + + _clist.insert(m); + + /* Adjust length in case of lack of prefix context */ + int pre = m->starttoken() - m->ctxt_startpos(); + if (pre < _surround_len) + adjust_len += _surround_len - pre; + + match_len += size; + + if (LOG_WOULD_LOG(spam)) { + std::string s; m->dump(s); + LOG(spam, "MatchCandidate(%s) size %ld, tot.len %d", s.c_str(), size, match_len); + } + assert(match_len > 0); + match_count++; + match_elems += m->elems(); + + _est_len = match_len - adjust_len + + (2*(_surround_len)+MIN_CONTINUATION)*match_count; + if (_est_len >= (int)_min_length + && match_count >= _max_matches) + break; + } + LOG(spam, "QHL: %d matches, raw len %d, estimated len %d, elements %d", + match_count, match_len, _est_len, match_elems); + + // Quick estimate of the query word length + _hit_len = 5*match_elems; + return match_elems; +} + + +/** Check if a character is a configured connector character + */ +bool SummaryDesc::word_connector(const unsigned char* s) +{ + unsigned char c = *s; + if (c & 0x80) { + ucs4_t u = Fast_UnicodeUtil::GetUTF8Char(s); + return (u <= 255 ? 
_sumconf->connector(u) : false); + } + return _sumconf->connector(c); +} + + +/* Move backwards/forwards from ptr (no longer than to start) in an + * UTF8 text until the beginning of an extended token or (if space, + * until beginning of the next/last word) + * A token in this function means some combination of words linked + * together with a single character of any of the configured set of + * legal connector characters. + * @return The number of bytes moved + */ +int SummaryDesc::complete_extended_token(unsigned char* start, ssize_t length, + const unsigned char*& ptr, off_t increment) +{ + int moved = 0; + const unsigned char *old_ptr = NULL; + for (;;) { + // Start by moving to the start/end of the word.. + moved += complete_word(start, length, ptr, increment); + + // Ensure that there is a quick way out of this at the end: + if (start >= ptr || start + length <= ptr || ptr == old_ptr) + return moved; + + // If we end up at the same place as last iteration, we need + // to bail (done above) to avoid an infinite loop. 
+ old_ptr = ptr; + + // Store a pointer to the found break: + const unsigned char* preptr = ptr; + + int prelen; + // Position to previous/next character to check if this is a + // "real" break: + if (increment < 0) { + prelen = Fast_UnicodeUtil::UTF8move(start, length, + preptr, increment); + if (!prelen) + return moved; + } else { + prelen = 0; + } + + // Handle default case ("ordinary" space) + if (!word_connector(preptr)) { + LOG(spam, "Not a word connector case (%c)", *preptr); + return moved; + } + char wconn = *preptr; + (void) wconn; + LOG(spam, "Found word connector case candidate (%c)", wconn); + + // Read the character before/after the connector character: + int addlen = Fast_UnicodeUtil::UTF8move(start, length, + preptr, increment); + if (!addlen) + return moved; // Not possible to extend anything here + + // Only a single connector character that connects word + // characters should lead us to include more words in the + // normal sense: + if (!wordchar(preptr)) + return moved; + + // If a block of chinese data does not contain any spaces we have to return + // here in order to avoid searching all the way to the start/end. + return moved; + + // Ok, found a separator case, include another word.. 
+ + moved += prelen + addlen; + // If going forward, the word completer will look at the + // previous char to see if we are at the start of a word, so + // we have to move forward once here: + if (increment > 0) { + addlen = Fast_UnicodeUtil::UTF8move(start, length, + preptr, increment); + if (!addlen) + return moved; + moved += addlen; + } + ptr = preptr; + + LOG(spam, "Found proper word connector case (%c,%c) yet moved %d", + wconn, *preptr, moved); + } +} + + + +/* Return a highlight tagged summary string from this summary + * description + */ +std::string SummaryDesc::get_summary(const char* buffer, size_t bytes, + const SummaryConfig* sumconf, + size_t& char_size) +{ + std::vector<char> s; + ssize_t prev_end = 0; + bool start_cont = false; // Set if this segment has been continued at the start + + LOG(debug, "start get_summary, substrings: %ld, est. length: %d", + _plist.size(), _est_len); + // Set the current summary config. Implies that get_summary is + // not MT safe wrt. this SummaryDesc (not a very heavy + // restriction..) + _sumconf = sumconf; + + juniper::Appender a(sumconf); + + int reserve_len = static_cast<int>(_est_len * 1.1); + if (reserve_len) + s.reserve(reserve_len); + print_list::iterator it = _plist.begin(); + print_list::iterator nit = it; + + /** Add continuation dots if not at the start of doc and not empty + * config */ + if (it != _plist.end() && (*it)._pos > 0) { + start_cont = true; + s.insert(s.end(), sumconf->dots().begin(), sumconf->dots().end()); + } + + /** Loop through all highlight_desc's in this SummaryDesc and + * build up the result string + */ + for (; it != _plist.end(); ++it) { + if (nit != _plist.end()) + ++nit; + highlight_desc& d = *it; + off_t next_pos = (nit == _plist.end() ? 0x7fffffff : (*nit)._pos); + + ssize_t len = d._len; + off_t pos = d._pos; + + if (pos < prev_end) { + // In spite of precautions keyword hits came so tight that + // we got ourselves an overlap after all. Just skip + // whatever needed.. 
+ LOG(spam, "Overlap elim during string buildup: " + "previous end %ld, current pos %ld", + prev_end, pos); + if (pos + len <= prev_end) { + continue; + } else { + off_t adj_len = prev_end - pos; + pos = prev_end; + len -= adj_len; + } + } + + // Actual work on the string to present: + + if (prev_end > 0 && prev_end < pos) { + start_cont = true; + s.insert(s.end(), sumconf->dots().begin(), sumconf->dots().end()); + } + if (d._highlight) + s.insert(s.end(), sumconf->highlight_on().begin(), sumconf->highlight_on().end()); + + /* Point to current startpoint to check for split + * word/starting space tokens (only if previous segment is not + * adjacent!) + */ + const unsigned char* ptr = + reinterpret_cast<const unsigned char*>(&buffer[pos]); + if (!d._highlight && start_cont && prev_end < pos) { + // Complete beginning word by extending the prefix + unsigned char* b = + reinterpret_cast<unsigned char*>(const_cast<char*>(buffer)); + int moved = complete_extended_token(b, bytes, ptr, -1); + pos -= moved; + len += moved; + } else if (!d._highlight) { + LOG(spam, "Not completing word at " + "char %c/0x%x, prev_end %ld, pos %ld", + *ptr, *ptr, prev_end, pos); + } + + /* Point to "current" endpos to check for split word/ending + * space tokens but only in the cases where the next segment + * is not adjacent. + */ + ptr = reinterpret_cast<const unsigned char*>(&buffer[pos+len]); + if (!d._highlight && next_pos > pos + len && + pos + len < static_cast<ssize_t>(bytes)) { + int max_len = std::min(static_cast<off_t>(bytes), next_pos); + // complete word at the end (these strings are either + // ... in the start or the end or not at all, but overlap + // is taken care of in the next loop.. 
Complete end of + // word by appending at the end + unsigned char* b = + reinterpret_cast<unsigned char*>(const_cast<char*>(buffer)); + int moved = complete_extended_token(b, max_len, ptr, +1); + len += moved; + if ((pos + len) >= next_pos) { + LOG(spam, "Word completion: no space char found - " + "joining at pos %ld", next_pos); + } + } else if (!d._highlight) { + LOG(spam, "Not completing word at " + "char %c/0x%x, next_pos %ld", + *ptr, *ptr, next_pos); + } + + JD_INVAR(JD_DESC, len >= 0, len = 0, + LOG(error, + "get_summary: Invariant failed, len = %ld", + static_cast<long>(len))); + int add_len = ((int)bytes > len ? len : bytes); + + LOG(spam, "bytes %ld pos %ld len %ld %s", + bytes, pos, len, (d._highlight ? "(highlight)" : "")); + + a.append(s, &buffer[pos], add_len); + len -= add_len; + pos += add_len; + + if (d._highlight) { + s.insert(s.end(), sumconf->highlight_off().begin(), sumconf->highlight_off().end()); + } + prev_end = pos + len; + } + if (s.size() > 0 && prev_end < (int)_document_length) + s.insert(s.end(), sumconf->dots().begin(), sumconf->dots().end()); + LOG(debug, "get_summary: Length of summary %ld bytes %ld chars", + s.size(), a.charLen()); + _sumconf = NULL; // Not valid after this call. 
+ char_size = a.charLen(); + return std::string(&s[0], s.size()); +} + + +bool SummaryDesc::overlap(MatchCandidate* m) +{ + // Walk through previous matches - exit if overlap + for (cand_list::iterator it = _clist.begin(); + it != _clist.end(); + ++it) + { + MatchCandidate *m1, *m2; + + if ((*it)->starttoken() < m->starttoken()) { + m1 = *it; + m2 = m; + } else { + m2 = *it; + m1 = m; + } + if (m1->endpos() > m2->starttoken()) { + LOG(spam, "overlap: [%ld, %ld] <-> [%ld, %ld]", + m->starttoken(), m->endpos(), + (*it)->starttoken(), (*it)->endpos()); + return true; + } + } + return false; +} + + +int SummaryDesc::recompute_estimate(int len_per_elem) +{ + int new_est = 0; + int affected_segments = 0; + _hit_len = 0; + + cand_list::iterator cit = _clist.begin(); + /* prefix */ + assert(cit != _clist.end()); + + bool prefix = true; + MatchCandidate* m = *cit; + off_t prev_pos = m->ctxt_startpos(); + + for (; cit != _clist.end(); ++cit) { + /* look at each keyword within match */ + m = *cit; + keylist& klist = (*cit)->_klist; + for (keylist::iterator kit = klist.begin(); + kit != klist.end(); + ++kit) + { + int seglen = (*kit)->startpos() - prev_pos; + if (seglen <= 0) { + LOG(spam, "recompute_estimate: Skipped additional match " + "at pos %lu", + (*kit)->startpos()); + continue; // skip multiple matches of same occurrence + } + _hit_len += (*kit)->tokenlen; + if (prefix) { + // Only fit one elem at start + if (len_per_elem < seglen) { + affected_segments++; + LOG(spam, "recompute_estimate prefix " + "(dist %d): len %d (affected)", + seglen, len_per_elem); + seglen = len_per_elem; + } else { + LOG(spam, "recompute_estimate: prefix len %d", + seglen); + } + prefix = false; + } else if ((len_per_elem << 1) < seglen) { + affected_segments +=2; + LOG(spam, "recompute_estimate(dist %d): " + "len %d (affected*2)", + seglen, len_per_elem*2 + MIN_CONTINUATION); + seglen = len_per_elem * 2 + MIN_CONTINUATION; + } else { + LOG(spam, "recompute_estimate: mid len %d", + seglen); + 
} + new_est += seglen; + prev_pos = (*kit)->startpos() + (*kit)->tokenlen; + } + } + + /* postfix */ + int xlen = _matcher->DocumentSize() - m->endpos(); + if (xlen < len_per_elem) { + new_est += xlen; + LOG(spam, "recompute_estimate: end len %d", xlen); + } else { + affected_segments++; + LOG(spam, "recompute_estimate: end len %d (affected)", + len_per_elem); + new_est += len_per_elem; + } + + LOG(spam, "recompute_estimate(%d): %d -> %d, affected %d", + len_per_elem, _est_len, new_est, affected_segments); + _est_len = new_est; + + /* Re-set available print length per element (prefix or postfix) */ + len_per_elem = (_length - _hit_len) / (_match_elems*2); + + // Adjust element length to sensible values + len_per_elem = std::max(MIN_SURROUND_LEN, len_per_elem); + + LOG(spam, "recompute_estimate --> %d", len_per_elem); + + if (affected_segments > 0 && _length > _est_len + MIN_SURROUND_LEN) { + int adj = (_length - _hit_len + - (_est_len + MIN_SURROUND_LEN)) / affected_segments; + + // Again re-adjust element length to sensible values + if (len_per_elem + adj < MIN_SURROUND_LEN) { + LOG(spam, "recompute_estimate(%d) " + "(below MIN_SURROUND_LEN threshold)", + len_per_elem); + adj = (MIN_SURROUND_LEN - len_per_elem); + len_per_elem = MIN_SURROUND_LEN; + } else { + len_per_elem += adj; + } + _est_len += adj * affected_segments; + LOG(spam, "recompute_estimate (adj %d) el.len %d new est_len %d", + adj, len_per_elem, _est_len); + } + return len_per_elem; +} + +void SummaryDesc::build_highlight_descs() +{ + /* Set available print length per element (prefix or postfix) */ + int len_per_elem; + + if (_est_len > (int)_length) { + len_per_elem = (_length - _hit_len) / (_match_elems*2); + + // Adjust element length to sensible values + len_per_elem = std::max(MIN_SURROUND_LEN, len_per_elem); + + /* Check that this does not yield a too long/too short teaser */ + len_per_elem = recompute_estimate(len_per_elem); + } else { + len_per_elem = _surround_len; + } + + // Max length to 
allow before a split is required: Note that we + // allow an extra MIN_CONTINUATION extra bytes to the total those + // times where matches are close + int middle_len = len_per_elem * 2 + MIN_CONTINUATION; + + int len = len_per_elem; // Max running length to update pointer with + + LOG(spam, "length pr. elem %d", len_per_elem); + + /* build the ordered highlight description list (stored in _plist) + * based on our collected info about the best matches available + * and the estimated length (len_per_elem) of a triple + * (pre-context, highlight keyword, post-context) (len_per_elem + * assumes no overlap). Identify a line segment at a time.. + */ + + off_t pos = 0; + off_t startpos = 0; + + for (cand_list::iterator cit = _clist.begin(); + cit != _clist.end(); + ++cit) + { + /* look at each keyword within match */ + keylist& klist = (*cit)->_klist; + + for (keylist::iterator kit = klist.begin(); + kit != klist.end(); + ++kit) + { + key_occ* k = *kit; + int max_len = k->startpos() - pos; + // the same occurrence may appear twice in a match, in + // which case length will be < 0 + if (max_len < 0) + continue; + + if (pos == 0) { + // Adding initial segment: + if (len < max_len) { + startpos = pos = max_len - len; + } else { + len = max_len; + } + add_desc(pos, len, false); + } else if (max_len <= middle_len) { + // Context in between fits completely + len = max_len; + add_desc(pos, len, false); + } else { + if (LOG_WOULD_LOG(spam)) { + int dist = (k->startpos() - len_per_elem) - (pos + len_per_elem); + LOG(spam, "Middle split case, distance: %d", dist); + } + len = max_len; + add_desc(pos, len_per_elem, false); + add_desc(k->startpos() - len_per_elem, len_per_elem, false); + } + // Finally add the keyword itself: + add_desc(k->startpos(), k->tokenlen, true); + pos += (k->tokenlen + len); + } + } + + if (pos > 0) { + // Adding final segment, ensure that there is enough text available.. 
+ int max_len = std::min(len_per_elem, + static_cast<int>(_matcher->DocumentSize() - pos)); + add_desc(pos, max_len, false); + } + LOG(debug, "Summary: start %ld end: %ld", startpos, pos); +} + + +/* create description for the complete document */ + +void SummaryDesc::build_fulldoc_desc() +{ + LOG(debug, "Generating query highlights for complete document"); + off_t pos = 0; + for (key_occ_vector::const_iterator kit = _occ.begin(); + kit != _occ.end(); ++kit) + { + int klen = (*kit)->tokenlen; + int kpos = (*kit)->startpos(); + add_desc(pos, kpos - pos, false); + // Use valid() info to filter out non-phrase terms if this is + // a phrase search: + add_desc(kpos, klen, (!_matcher->UsesValid()) || (*kit)->valid()); + pos = kpos + klen; + } + add_desc(pos, _matcher->DocumentSize() - pos, false); + _est_len = _matcher->DocumentSize(); +} + + +void SummaryDesc::add_desc(off_t pos, ssize_t len, bool highlight) +{ + if (len == 0) + return; + JD_INVAR(JD_DUMP, len > 0, return, + LOG(info, "add_desc len %ld, %s", static_cast<long>(len), + (highlight ? "highlight" : "")); assert(false)); + _plist.push_back(highlight_desc(pos, len, highlight)); +} diff --git a/juniper/src/vespa/juniper/sumdesc.h b/juniper/src/vespa/juniper/sumdesc.h new file mode 100644 index 00000000000..2ad1574ea5a --- /dev/null +++ b/juniper/src/vespa/juniper/sumdesc.h @@ -0,0 +1,108 @@ +// Copyright 2016 Yahoo Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. +#pragma once + +/* $Id$ */ + +#include <list> +#include "juniperdebug.h" +#include "mcand.h" +#define _NEED_SUMMARY_CONFIG_IMPL 1 +#include "SummaryConfig.h" + +/** A class of objects describing a query highlight dynamic summary based on a + * given state of the matcher. + * This module defines the teaser appearance given the matches as input. 
+ */ + +/* The minimal distance to introduce a continuation symbol for */ +#define MIN_CONTINUATION 8 + +/* The minimal surround length to ever set */ +#define MIN_SURROUND_LEN 10 + +/* Allow word split if word longer than this */ +#define MAX_SCAN_WORD 0x40 + +class Matcher; +class IDocumentFeeder; + +class SummaryDesc +{ +public: + // Constructor that builds a description that can later be used to create + // a suitable query in context / query highlight for the given matcher + // in its current status: + SummaryDesc(Matcher* matcher, ssize_t length, ssize_t min_length, int max_matches, + int surround_len); + + /* Return a highlight tagged summary string + * from this summary description + */ + std::string get_summary(IDocumentFeeder* feeder, SummaryConfig* sumconf, size_t& char_size); + + /* Return a highlight tagged summary string + * from this summary description + */ + std::string get_summary(const char* buffer, size_t len, + const SummaryConfig* sumconf, size_t& char_size); + +protected: + + /** A simple object that describes the contiguous elements of the generated summary + */ + class highlight_desc + { + public: + highlight_desc(off_t pos, ssize_t len, bool highlight); + off_t _pos; /* Start pos of item within document */ + ssize_t _len; /* Length of print item */ + bool _highlight; /* Whether to highlight item or not */ + }; + + void add_desc(off_t pos, ssize_t len, bool highlight); + + typedef JUNIPER_SET<MatchCandidate*,sequential_elem<MatchCandidate*> > cand_list; + typedef std::list<highlight_desc> print_list; + + /** Helper function to build a simple query highlight of the complete document */ + void build_fulldoc_desc(); + + /** Helper functions to build a dynamic teaser extract */ + int find_matches(); + int recompute_estimate(int len_per_elem); + void build_highlight_descs(); + void locate_accidential_matches(); + + bool overlap(MatchCandidate* m); + bool word_connector(const unsigned char* s); + int complete_extended_token(unsigned char* start, 
ssize_t length, + const unsigned char*& ptr, off_t increment); + +private: + /* desired net printout length */ + Matcher* _matcher; + const key_occ_vector& _occ; // Reference to the matcher's occurrence list + /* Reference to the matchers ordered set of matches (match result set) */ + match_candidate_set& _match_results; + ssize_t _length; // desired length of the generated summary + ssize_t _min_length; // desired minimum length of the generated summary + int _remaining; // What's left to generate + int _surround_len; // how much context to put around + int _est_len; // Estimated length of the generated summary + int _hit_len; // Estimated/computed total length of all query hit terms + + /* Temporary sequentially ordered match list used during computation */ + cand_list _clist; + /* The resulting list of print descriptions */ + print_list _plist; + const SummaryConfig* _sumconf; // The current config from a running get_summary call + int _max_matches; // The maximal number of matches to try as long as within _min_length + int _match_elems; // Total number of keywords found in matches + size_t _document_length; // Length of original document + bool _fulldoc; // Set if requesting a full document (to avoid cuts) + + SummaryDesc(SummaryDesc &); + SummaryDesc &operator=(SummaryDesc &); +}; + + diff --git a/juniper/src/vespa/juniper/tokenizer.cpp b/juniper/src/vespa/juniper/tokenizer.cpp new file mode 100644 index 00000000000..262c0046fb3 --- /dev/null +++ b/juniper/src/vespa/juniper/tokenizer.cpp @@ -0,0 +1,68 @@ +// Copyright 2016 Yahoo Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. 
+#include <vespa/fastos/fastos.h> +#include <vespa/log/log.h> +LOG_SETUP(".juniper.tokenizer"); +#include <vespa/fastlib/text/wordfolder.h> +#include "tokenizer.h" +#include "juniperdebug.h" + +JuniperTokenizer::JuniperTokenizer(Fast_WordFolder* wordfolder, + const char* text, size_t len, ITokenProcessor* successor, + const juniper::SpecialTokenRegistry * registry) : + _wordfolder(wordfolder), _text(text), _len(len), _successor(successor), _registry(registry), + _charpos(0), _wordpos(0) +{ +} + + +void JuniperTokenizer::SetText(const char* text, size_t len) +{ + _text = text; + _len = len; + _charpos = 0; + _wordpos = 0; +} + + +// Scan the input and dispatch to the successor +void JuniperTokenizer::scan() +{ + ITokenProcessor::Token token; + + const char* src = _text; + const char* src_end = _text + _len; + const char* startpos = NULL; + ucs4_t* dst = _buffer; + ucs4_t* dst_end = dst + TOKEN_DSTLEN; + size_t result_len; + + while (src < src_end) + { + if (_registry == NULL) { + // explicit prefetching seems to have negative effect with many threads + // FastOS_Prefetch::NT(const_cast<void *>((const void *)(src + 32))); + src = _wordfolder->UCS4Tokenize(src, src_end, dst, dst_end, startpos, result_len); + } else { + const char * tmpSrc = _registry->tokenize(src, src_end, dst, dst_end, startpos, result_len); + if (tmpSrc == NULL) { + src = _wordfolder->UCS4Tokenize(src, src_end, dst, dst_end, startpos, result_len); + } else { + src = tmpSrc; + } + } + if (dst[0] == 0) break; + token.curlen = result_len; + token.token = dst; + token.wordpos = _wordpos++; + token.bytepos = startpos - _text; + token.bytelen = src - startpos; + LOG(debug, "curlen %d, bytepos %ld, bytelen %d", + token.curlen, token.bytepos, token.bytelen); + // NB! not setting charlen/charpos/_utf8pos/_utf8len yet...! 
+ _successor->handle_token(token); + } + token.bytepos = _len; + token.bytelen = 0; + token.token = NULL; + _successor->handle_end(token); +} diff --git a/juniper/src/vespa/juniper/tokenizer.h b/juniper/src/vespa/juniper/tokenizer.h new file mode 100644 index 00000000000..e5c6f3d139e --- /dev/null +++ b/juniper/src/vespa/juniper/tokenizer.h @@ -0,0 +1,38 @@ +// Copyright 2016 Yahoo Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. +#pragma once + +#include "specialtokenregistry.h" +#include "ITokenProcessor.h" + +class Fast_WordFolder; + +#define TOKEN_DSTLEN 1024 + +class JuniperTokenizer +{ +public: + JuniperTokenizer(Fast_WordFolder* wordfolder, + const char* text, size_t len, ITokenProcessor* = NULL, + const juniper::SpecialTokenRegistry * registry = NULL); + inline void SetSuccessor(ITokenProcessor* successor) { _successor = successor; } + void setRegistry(const juniper::SpecialTokenRegistry * registry) { _registry = registry; } + + void SetText(const char* text, size_t len); + + // Scan the input and dispatch to the successor + void scan(); +private: + Fast_WordFolder* _wordfolder; + const char* _text; // The current input text + size_t _len; // Length of the text input + ITokenProcessor* _successor; + const juniper::SpecialTokenRegistry * _registry; + off_t _charpos; // Last utf8 character position + off_t _wordpos; // Offset in numbering of words compared to input (as result of splits) + ucs4_t _buffer[TOKEN_DSTLEN]; // Temp. buffer to store folding result +private: + JuniperTokenizer(const JuniperTokenizer&); + JuniperTokenizer& operator=(const JuniperTokenizer&); +}; + + |