searchsummary/src/vespa/juniper/dpinterface.h


1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96

// Copyright Vespa.ai. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.

#pragma once

#include <sys/types.h>

#include <cstddef>
#include <cstdio>

/** @file dpinterface.h This file is the main include file for integrators of the document
 *    processing/indexing stages of Juniper specific processing. For integrating
 *    result processing (core Juniper - highlighting/proximity metric computation) refer to
 *    rpinterface.h.
 */

namespace juniper {

/** @enum Tokentype
 *  Hint telling Juniper which kind of token it is being handed.
 *  When the caller has already aggregated this information, passing it
 *  along lets Juniper save some extra computation.
 */
enum Tokentype
{
    TOKEN_UNKNOWN = 0,  ///< Token type info not present
    TOKEN_WORD,         ///< A word token
    TOKEN_SEP,          ///< A separator token
    TOKEN_MARKUP,       ///< Token contains general unspecified markup
    TOKEN_OTHER,        ///< Something else than any of the above
    TOKEN_MAX           ///< Max token types currently supported
};

/** Opaque reference to the Juniper internal representation of a document summary.
 *  Allows transport of Juniper information between different stages of the
 *  document processing without having to serialize/deserialize for each such step.
 *  Only forward-declared in this interface, so users handle it by pointer.
 */
class Docsum;

/** @class DocsumProcessor
 *  Interface for document processors that are specific to, and implemented in,
 *  Juniper. Such processors operate on doc summaries (at proper places in the
 *  document processing pipelines) to enhance and annotate the source for
 *  Juniper result processing (see rpinterface.h).
 */
class DocsumProcessor
{
public:
    /** Virtual destructor so implementations can be destroyed through this interface. */
    virtual ~DocsumProcessor() = default;

    /** Process a docsum with this processor. In the cases where token based
     *  processing is necessary, an implementation may just record which
     *  document summary to do processing for, but the call can also yield a
     *  complete processing.
     *  @param docsum_input a previously serialized Docsum object or an UTF-8 string
     *  @param length Length in bytes of the docsum_input object
     *  @return false if the operation failed, true otherwise
     */
    virtual bool Process(const char* docsum_input, size_t length) = 0;

    /** Process a docsum with this processor
     *  @param docsum an input Docsum to process. This DocsumProcessor
     *    also takes responsibility for releasing the Docsum object if necessary;
     *    that is, if GetDocsum has not been called when this object is deleted,
     *    the Docsum gets released as well.
     *  @return false if the operation failed, true otherwise
     */
    virtual bool Process(Docsum* docsum) = 0;

    /** Low level document processing
     *  @param rep A textual representation of the token to process
     *  @param start The start position of this token within the original text
     *  @param len Length of the token representation
     *  @param type The token type in question (to allow saving of
     *     processing time in Juniper)
     *  @return true if operation ok, false if failure to process
     */
    virtual bool ProcessToken(const char* rep, off_t start, size_t len, Tokentype type) = 0;

    /** Retrieve a reference to the docsum representation
     *  @return The Docsum object including the current state of the docsum.
     *    This Docsum object must later be released by the caller using ReleaseDocsum
     *    or handed over to a subsequent processor.
     */
    virtual Docsum* GetDocsum() = 0;

    /** Create a textual representation of the annotated docsum suitable for disk storage
     *  for later usage by Juniper result processing.
     *  @param length Output: the length of the serialized docsum
     *  @return A pointer to the text representation of the docsum. This object
     *    is valid throughout the life of this document processor or until
     *    the next call to Serialize() for this processor.
     */
    virtual const char* Serialize(size_t& length) = 0;
};

/** Release a Docsum object.
 *  Counterpart to DocsumProcessor::GetDocsum(): a Docsum obtained from there
 *  must later be released with this function unless it is handed over to a
 *  subsequent processor.
 *  @param docsum The Docsum to release
 */
void ReleaseDocsum(Docsum* docsum);

} // end namespace juniper