1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
|
// Copyright Yahoo. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
#pragma once
/* $Id$ */
#include <list>
#include "juniperdebug.h"
#include "mcand.h"
#define _NEED_SUMMARY_CONFIG_IMPL 1
#include "SummaryConfig.h"
/** SummaryDesc (declared below) describes a query-highlighted dynamic summary
* based on a given state of the matcher.
* This module defines the appearance of the teaser, given the matches as input.
*/
/* Tuning constants for summary generation. The units are not stated in this
 * header; presumably they are lengths within the document text -- confirm
 * against the implementation before changing them.
 */
/* The minimal distance for which a continuation symbol is introduced */
#define MIN_CONTINUATION 8
/* The smallest surround length that is ever set */
#define MIN_SURROUND_LEN 10
/* Allow a word to be split if it is longer than this (0x40 == 64) */
#define MAX_SCAN_WORD 0x40
/* Forward declarations: this header only uses pointers to these types. */
class Matcher;
class IDocumentFeeder;
/**
 * Describes a query-highlighted dynamic summary (teaser) based on the state
 * of a Matcher, and produces the highlight-tagged summary string from that
 * description.
 *
 * Objects of this class are not copyable.
 */
class SummaryDesc
{
public:
    /**
     * Builds a description that can later be used to create a suitable
     * query-in-context / query highlight for the given matcher in its
     * current state.
     *
     * The parameters presumably initialize the same-named members documented
     * in the private section; the constructor body is not part of this header.
     *
     * @param matcher      matcher whose current state is described
     * @param length       desired length of the generated summary
     * @param min_length   desired minimum length of the generated summary
     * @param max_matches  maximal number of matches to try
     * @param surround_len how much context to put around
     */
    SummaryDesc(Matcher* matcher, ssize_t length, ssize_t min_length, int max_matches, int surround_len);
    ~SummaryDesc();

    // Copying is disabled. Explicitly deleted (rather than declared private and
    // left undefined) so that any misuse is diagnosed at compile time.
    SummaryDesc(const SummaryDesc&) = delete;
    SummaryDesc& operator=(const SummaryDesc&) = delete;

    /**
     * Returns a highlight-tagged summary string from this summary description,
     * taking the document text from a feeder.
     *
     * @param feeder    source of the document
     * @param sumconf   summary configuration to use for this call
     * @param char_size in/out reference; presumably receives the size of the
     *                  result in characters -- confirm against the implementation
     */
    std::string get_summary(IDocumentFeeder* feeder, SummaryConfig* sumconf, size_t& char_size);

    /**
     * Returns a highlight-tagged summary string from this summary description,
     * taking the document text from a memory buffer.
     *
     * @param buffer    start of the document text
     * @param len       length of buffer
     * @param sumconf   summary configuration to use for this call
     * @param char_size in/out reference; presumably receives the size of the
     *                  result in characters -- confirm against the implementation
     */
    std::string get_summary(const char* buffer, size_t len,
                            const SummaryConfig* sumconf, size_t& char_size);

protected:
    /** A simple object that describes one contiguous element of the generated summary. */
    class highlight_desc
    {
    public:
        highlight_desc(off_t pos, ssize_t len, bool highlight);
        off_t _pos;      /* Start pos of item within document */
        ssize_t _len;    /* Length of print item */
        bool _highlight; /* Whether to highlight item or not */
    };

    /** Adds a print item; the arguments mirror those of the highlight_desc constructor. */
    void add_desc(off_t pos, ssize_t len, bool highlight);

    /* Set of match candidates ordered by the sequential_elem comparator */
    using cand_list = std::set<MatchCandidate*,sequential_elem<MatchCandidate*> >;
    /* List of print items making up the summary */
    using print_list = std::list<highlight_desc>;

    /** Helper function to build a simple query highlight of the complete document */
    void build_fulldoc_desc();

    /** Helper functions to build a dynamic teaser extract */
    int find_matches();
    int recompute_estimate(int len_per_elem);
    void build_highlight_descs();
    void locate_accidential_matches();
    bool overlap(MatchCandidate* m);
    bool word_connector(const unsigned char* s);
    int complete_extended_token(unsigned char* start, ssize_t length,
                                const unsigned char*& ptr, off_t increment);

private:
    // NOTE: keep the declaration order of the data members below; members are
    // initialized in declaration order.
    Matcher* _matcher;          // The matcher this summary description is based on
    const key_occ_vector& _occ; // Reference to the matcher's occurrence list
    /* Reference to the matcher's ordered set of matches (match result set) */
    match_candidate_set& _match_results;
    ssize_t _length;     // desired length of the generated summary
    ssize_t _min_length; // desired minimum length of the generated summary
    int _remaining;      // What's left to generate
    int _surround_len;   // how much context to put around
    int _est_len;        // Estimated length of the generated summary
    int _hit_len;        // Estimated/computed total length of all query hit terms
    /* Temporary sequentially ordered match list used during computation */
    cand_list _clist;
    /* The resulting list of print descriptions */
    print_list _plist;
    const SummaryConfig* _sumconf; // The current config from a running get_summary call
    int _max_matches;        // The maximal number of matches to try as long as within _min_length
    int _match_elems;        // Total number of keywords found in matches
    size_t _document_length; // Length of original document
    bool _fulldoc;           // Set if requesting a full document (to avoid cuts)
};
|