aboutsummaryrefslogtreecommitdiffstats
path: root/juniper/src/vespa/juniper/appender.h
blob: ade199e00c2d18fe96d5c51fb87827bb27e37536 (plain) (blame)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
// Copyright Yahoo. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
#pragma once

#include <vespa/vespalib/util/hdr_abort.h>

namespace juniper {

class Appender
{
private:
    const SummaryConfig *_sumconf;
    bool                 _escape_markup;
    bool                 _preserve_white_space;
    bool                 _last_was_space;
    size_t               _char_len;

    inline void append(std::vector<char> & s, char c) {
        JD_INVAR(JD_INPUT, c != 0, return,\
                 LOG(warning, "Document source contained 0-bytes"));
        // eliminate separators:
        if (_sumconf->separator(c)) {
            return;
        }

        // eliminate multiple space characters
        if (!_preserve_white_space) {
            if (c > 0 && isspace(c)) {
                if (_last_was_space) {
                    return;
                } else {
                    _last_was_space = true;
                }
                c = ' '; // Never output newline or tab
            } else {
                _last_was_space = false;
            }
        }

        bool handled_as_markup;
        if (_escape_markup) {
            handled_as_markup = true;
            switch (c) {
            case '<':
                s.push_back('&');
                s.push_back('l');
                s.push_back('t');
                s.push_back(';');
                break;
            case '>':
                s.push_back('&');
                s.push_back('g');
                s.push_back('t');
                s.push_back(';');
                break;
            case '"':
                s.push_back('&');
                s.push_back('q');
                s.push_back('u');
                s.push_back('o');
                s.push_back('t');
                s.push_back(';');
                break;
            case '&':
                s.push_back('&');
                s.push_back('a');
                s.push_back('m');
                s.push_back('p');
                s.push_back(';');
                break;
            case '\'':
                s.push_back('&');
                s.push_back('#');
                s.push_back('3');
                s.push_back('9');
                s.push_back(';');
                break;
            default:
                handled_as_markup = false;
                break;
            }
            if (handled_as_markup) {
                _char_len++;
            }
        } else {
            handled_as_markup = false;
        }

        if (!handled_as_markup) {
            s.push_back(c);
            /** If at start of an UTF8 character (both highest bits or none of them set)
             *  another char is accumulated..
             */
            if (!(c & 0x80) || (c & 0x40) ) {
                _char_len++;
            }
        }
    }

public:
    Appender(const SummaryConfig *sumconf)
        : _sumconf(sumconf),
          _escape_markup(false),
          _preserve_white_space(false),
          _last_was_space(false),
          _char_len(0)
    {
        ConfigFlag esc_conf = _sumconf->escape_markup();

        switch (esc_conf) {
        case CF_OFF:
            _escape_markup = false;
            break;
        case CF_ON:
            _escape_markup = true;
            break;
        case CF_AUTO:
            _escape_markup = (_sumconf->highlight_on()[0] == '<' ||
                              _sumconf->highlight_off()[0] == '<' ||
                              _sumconf->dots()[0] == '<');
            break;
        }

        if (_sumconf->preserve_white_space() == CF_ON) {
            _preserve_white_space = true;
        }
    }

    size_t charLen() const { return _char_len; }

    void append(std::vector<char>& s, const char* ds, int length) {
        for (int i = 0; i < length; i++) {
            append(s, ds[i]);
        }
    }
};

} // end namespace juniper