Statistics
| Branch: | Revision:

root / json-lexer.c @ b011f619

History | View | Annotate | Download (9.2 kB)

1
/*
2
 * JSON lexer
3
 *
4
 * Copyright IBM, Corp. 2009
5
 *
6
 * Authors:
7
 *  Anthony Liguori   <aliguori@us.ibm.com>
8
 *
9
 * This work is licensed under the terms of the GNU LGPL, version 2.1 or later.
10
 * See the COPYING.LIB file in the top-level directory.
11
 *
12
 */
13

    
14
#include "qstring.h"
15
#include "qlist.h"
16
#include "qdict.h"
17
#include "qint.h"
18
#include "qemu-common.h"
19
#include "json-lexer.h"
20

    
21
/* Upper bound, in bytes, on the length of a single token (64 MiB). */
#define MAX_TOKEN_SIZE (64ULL * 1024 * 1024)
22

    
23
/*
 * Informal regular expressions for the tokens the lexer recognizes:
 *
 * \"([^\\\"]|(\\\"\\'\\\\\\/\\b\\f\\n\\r\\t\\u[0-9a-fA-F][0-9a-fA-F][0-9a-fA-F][0-9a-fA-F]))*\"
 * '([^\\']|(\\\"\\'\\\\\\/\\b\\f\\n\\r\\t\\u[0-9a-fA-F][0-9a-fA-F][0-9a-fA-F][0-9a-fA-F]))*'
 * 0|([1-9][0-9]*(.[0-9]+)?([eE]([-+])?[0-9]+))
 * [{}\[\],:]
 * [a-z]+
 *
 */
31

    
32
/*
 * States of the lexer's state machine; each value indexes one row of the
 * json_lexer transition table.
 *
 * IN_ERROR is deliberately 0: any table entry that is not explicitly
 * initialized is zero and therefore means "invalid input".
 *
 * NOTE(review): the table also stores JSON_* token types (declared
 * elsewhere) in the same cells; presumably their values do not collide
 * with the states below -- confirm against json-lexer.h.
 */
enum json_lexer_state {
    /* invalid input */
    IN_ERROR = 0,

    /* double-quoted string, including \uXXXX escapes */
    IN_DQ_UCODE3,
    IN_DQ_UCODE2,
    IN_DQ_UCODE1,
    IN_DQ_UCODE0,
    IN_DQ_STRING_ESCAPE,
    IN_DQ_STRING,

    /* single-quoted string, including \uXXXX escapes */
    IN_SQ_UCODE3,
    IN_SQ_UCODE2,
    IN_SQ_UCODE1,
    IN_SQ_UCODE0,
    IN_SQ_STRING_ESCAPE,
    IN_SQ_STRING,

    /* numbers */
    IN_ZERO,
    IN_DIGITS,
    IN_DIGIT,
    IN_EXP_E,
    IN_MANTISSA,
    IN_MANTISSA_DIGITS,
    IN_NONZERO_NUMBER,
    IN_NEG_NONZERO_NUMBER,

    /* keywords made of lower-case letters */
    IN_KEYWORD,

    /* '%' escapes */
    IN_ESCAPE,
    IN_ESCAPE_L,
    IN_ESCAPE_LL,
    IN_ESCAPE_I,
    IN_ESCAPE_I6,
    IN_ESCAPE_I64,

    /* runs of blanks between tokens */
    IN_WHITESPACE,

    /* initial state, also entered after every completed token */
    IN_START,
};
64

    
65
/* Initializer fragment for one row of the transition table: make TOKEN the
   default transition for every byte in 0x00..0x7F.  Designators that follow
   it in the same row override the default for individual bytes.  */
#define TERMINAL(token) [0 ... 0x7F] = (token)
66

    
67
/* Tell whether moving from FROM_STATE to TOKEN was a lookahead transition,
   i.e. TOKEN is the row default installed by the TERMINAL macro in the table
   below.  The test reads column 0 of FROM_STATE's row, which holds that
   default for every row built with TERMINAL.  A true result means the byte
   that triggered the transition is not part of the finished token.  */
#define TERMINAL_NEEDED_LOOKAHEAD(from_state, token) \
    (json_lexer[(from_state)][0] == (token))
72

    
73
static const uint8_t json_lexer[][256] =  {
74
    /* double quote string */
75
    [IN_DQ_UCODE3] = {
76
        ['0' ... '9'] = IN_DQ_STRING,
77
        ['a' ... 'f'] = IN_DQ_STRING,
78
        ['A' ... 'F'] = IN_DQ_STRING,
79
    },
80
    [IN_DQ_UCODE2] = {
81
        ['0' ... '9'] = IN_DQ_UCODE3,
82
        ['a' ... 'f'] = IN_DQ_UCODE3,
83
        ['A' ... 'F'] = IN_DQ_UCODE3,
84
    },
85
    [IN_DQ_UCODE1] = {
86
        ['0' ... '9'] = IN_DQ_UCODE2,
87
        ['a' ... 'f'] = IN_DQ_UCODE2,
88
        ['A' ... 'F'] = IN_DQ_UCODE2,
89
    },
90
    [IN_DQ_UCODE0] = {
91
        ['0' ... '9'] = IN_DQ_UCODE1,
92
        ['a' ... 'f'] = IN_DQ_UCODE1,
93
        ['A' ... 'F'] = IN_DQ_UCODE1,
94
    },
95
    [IN_DQ_STRING_ESCAPE] = {
96
        ['b'] = IN_DQ_STRING,
97
        ['f'] =  IN_DQ_STRING,
98
        ['n'] =  IN_DQ_STRING,
99
        ['r'] =  IN_DQ_STRING,
100
        ['t'] =  IN_DQ_STRING,
101
        ['/'] = IN_DQ_STRING,
102
        ['\\'] = IN_DQ_STRING,
103
        ['\''] = IN_DQ_STRING,
104
        ['\"'] = IN_DQ_STRING,
105
        ['u'] = IN_DQ_UCODE0,
106
    },
107
    [IN_DQ_STRING] = {
108
        [1 ... 0xBF] = IN_DQ_STRING,
109
        [0xC2 ... 0xF4] = IN_DQ_STRING,
110
        ['\\'] = IN_DQ_STRING_ESCAPE,
111
        ['"'] = JSON_STRING,
112
    },
113

    
114
    /* single quote string */
115
    [IN_SQ_UCODE3] = {
116
        ['0' ... '9'] = IN_SQ_STRING,
117
        ['a' ... 'f'] = IN_SQ_STRING,
118
        ['A' ... 'F'] = IN_SQ_STRING,
119
    },
120
    [IN_SQ_UCODE2] = {
121
        ['0' ... '9'] = IN_SQ_UCODE3,
122
        ['a' ... 'f'] = IN_SQ_UCODE3,
123
        ['A' ... 'F'] = IN_SQ_UCODE3,
124
    },
125
    [IN_SQ_UCODE1] = {
126
        ['0' ... '9'] = IN_SQ_UCODE2,
127
        ['a' ... 'f'] = IN_SQ_UCODE2,
128
        ['A' ... 'F'] = IN_SQ_UCODE2,
129
    },
130
    [IN_SQ_UCODE0] = {
131
        ['0' ... '9'] = IN_SQ_UCODE1,
132
        ['a' ... 'f'] = IN_SQ_UCODE1,
133
        ['A' ... 'F'] = IN_SQ_UCODE1,
134
    },
135
    [IN_SQ_STRING_ESCAPE] = {
136
        ['b'] = IN_SQ_STRING,
137
        ['f'] =  IN_SQ_STRING,
138
        ['n'] =  IN_SQ_STRING,
139
        ['r'] =  IN_SQ_STRING,
140
        ['t'] =  IN_SQ_STRING,
141
        ['/'] = IN_DQ_STRING,
142
        ['\\'] = IN_DQ_STRING,
143
        ['\''] = IN_SQ_STRING,
144
        ['\"'] = IN_SQ_STRING,
145
        ['u'] = IN_SQ_UCODE0,
146
    },
147
    [IN_SQ_STRING] = {
148
        [1 ... 0xBF] = IN_SQ_STRING,
149
        [0xC2 ... 0xF4] = IN_SQ_STRING,
150
        ['\\'] = IN_SQ_STRING_ESCAPE,
151
        ['\''] = JSON_STRING,
152
    },
153

    
154
    /* Zero */
155
    [IN_ZERO] = {
156
        TERMINAL(JSON_INTEGER),
157
        ['0' ... '9'] = IN_ERROR,
158
        ['.'] = IN_MANTISSA,
159
    },
160

    
161
    /* Float */
162
    [IN_DIGITS] = {
163
        TERMINAL(JSON_FLOAT),
164
        ['0' ... '9'] = IN_DIGITS,
165
    },
166

    
167
    [IN_DIGIT] = {
168
        ['0' ... '9'] = IN_DIGITS,
169
    },
170

    
171
    [IN_EXP_E] = {
172
        ['-'] = IN_DIGIT,
173
        ['+'] = IN_DIGIT,
174
        ['0' ... '9'] = IN_DIGITS,
175
    },
176

    
177
    [IN_MANTISSA_DIGITS] = {
178
        TERMINAL(JSON_FLOAT),
179
        ['0' ... '9'] = IN_MANTISSA_DIGITS,
180
        ['e'] = IN_EXP_E,
181
        ['E'] = IN_EXP_E,
182
    },
183

    
184
    [IN_MANTISSA] = {
185
        ['0' ... '9'] = IN_MANTISSA_DIGITS,
186
    },
187

    
188
    /* Number */
189
    [IN_NONZERO_NUMBER] = {
190
        TERMINAL(JSON_INTEGER),
191
        ['0' ... '9'] = IN_NONZERO_NUMBER,
192
        ['e'] = IN_EXP_E,
193
        ['E'] = IN_EXP_E,
194
        ['.'] = IN_MANTISSA,
195
    },
196

    
197
    [IN_NEG_NONZERO_NUMBER] = {
198
        ['0'] = IN_ZERO,
199
        ['1' ... '9'] = IN_NONZERO_NUMBER,
200
    },
201

    
202
    /* keywords */
203
    [IN_KEYWORD] = {
204
        TERMINAL(JSON_KEYWORD),
205
        ['a' ... 'z'] = IN_KEYWORD,
206
    },
207

    
208
    /* whitespace */
209
    [IN_WHITESPACE] = {
210
        TERMINAL(JSON_SKIP),
211
        [' '] = IN_WHITESPACE,
212
        ['\t'] = IN_WHITESPACE,
213
        ['\r'] = IN_WHITESPACE,
214
        ['\n'] = IN_WHITESPACE,
215
    },        
216

    
217
    /* escape */
218
    [IN_ESCAPE_LL] = {
219
        ['d'] = JSON_ESCAPE,
220
    },
221

    
222
    [IN_ESCAPE_L] = {
223
        ['d'] = JSON_ESCAPE,
224
        ['l'] = IN_ESCAPE_LL,
225
    },
226

    
227
    [IN_ESCAPE_I64] = {
228
        ['d'] = JSON_ESCAPE,
229
    },
230

    
231
    [IN_ESCAPE_I6] = {
232
        ['4'] = IN_ESCAPE_I64,
233
    },
234

    
235
    [IN_ESCAPE_I] = {
236
        ['6'] = IN_ESCAPE_I6,
237
    },
238

    
239
    [IN_ESCAPE] = {
240
        ['d'] = JSON_ESCAPE,
241
        ['i'] = JSON_ESCAPE,
242
        ['p'] = JSON_ESCAPE,
243
        ['s'] = JSON_ESCAPE,
244
        ['f'] = JSON_ESCAPE,
245
        ['l'] = IN_ESCAPE_L,
246
        ['I'] = IN_ESCAPE_I,
247
    },
248

    
249
    /* top level rule */
250
    [IN_START] = {
251
        ['"'] = IN_DQ_STRING,
252
        ['\''] = IN_SQ_STRING,
253
        ['0'] = IN_ZERO,
254
        ['1' ... '9'] = IN_NONZERO_NUMBER,
255
        ['-'] = IN_NEG_NONZERO_NUMBER,
256
        ['{'] = JSON_OPERATOR,
257
        ['}'] = JSON_OPERATOR,
258
        ['['] = JSON_OPERATOR,
259
        [']'] = JSON_OPERATOR,
260
        [','] = JSON_OPERATOR,
261
        [':'] = JSON_OPERATOR,
262
        ['a' ... 'z'] = IN_KEYWORD,
263
        ['%'] = IN_ESCAPE,
264
        [' '] = IN_WHITESPACE,
265
        ['\t'] = IN_WHITESPACE,
266
        ['\r'] = IN_WHITESPACE,
267
        ['\n'] = IN_WHITESPACE,
268
    },
269
};
270

    
271
/*
 * Prepare LEXER for use.
 *
 * Stores FUNC as the callback invoked for each emitted token, puts the state
 * machine in IN_START, gives the lexer a fresh token buffer from
 * qstring_new() and zeroes the x/y position counters.
 */
void json_lexer_init(JSONLexer *lexer, JSONLexerEmitter func)
{
    lexer->x = 0;
    lexer->y = 0;
    lexer->state = IN_START;
    lexer->emit = func;
    lexer->token = qstring_new();
}
278

    
279
/*
 * Run the state machine on one input byte.
 *
 * CH is looked up in the transition table for the current state.  When the
 * transition is a lookahead one (see TERMINAL_NEEDED_LOOKAHEAD), CH is not
 * appended to the current token and, unless FLUSH is set, is looked up again
 * from the state reached after the token has been emitted.  With FLUSH set
 * the table is consulted exactly once.
 *
 * Completed tokens and errors are handed to lexer->emit together with the
 * current x/y position.  Returns 0 on every path.
 */
static int json_lexer_feed_char(JSONLexer *lexer, char ch, bool flush)
{
    bool consumed;
    int next;

    /* Position bookkeeping: a newline resets x and bumps y, any other byte
     * advances x. */
    if (ch == '\n') {
        lexer->x = 0;
        lexer->y++;
    } else {
        lexer->x++;
    }

    for (;;) {
        next = json_lexer[lexer->state][(uint8_t)ch];
        consumed = !TERMINAL_NEEDED_LOOKAHEAD(lexer->state, next);
        if (consumed) {
            qstring_append_chr(lexer->token, ch);
        }

        if (next == IN_ERROR) {
            /* XXX: Bad input must not leave the parser in an unresponsive
             * state in which it swallows unpredictable amounts of later
             * "good" input.  Report the error upwards to the
             * tokenizer/parser by emitting JSON_ERROR, then start over.
             *
             * This is also required for reliable channel negotiation
             * between QMP and the guest agent: chr(0xFF) is placed at the
             * beginning of certain events to ensure proper delivery when
             * the channel is in an unknown state.  chr(0xFF) is never a
             * valid ASCII/UTF-8 sequence, so it reliably induces an
             * error/flush state.
             */
            lexer->emit(lexer, lexer->token, JSON_ERROR, lexer->x, lexer->y);
            QDECREF(lexer->token);
            lexer->token = qstring_new();
            lexer->state = IN_START;
            return 0;
        }

        switch (next) {
        case JSON_OPERATOR:
        case JSON_ESCAPE:
        case JSON_INTEGER:
        case JSON_FLOAT:
        case JSON_KEYWORD:
        case JSON_STRING:
            /* A token is complete: hand it to the consumer. */
            lexer->emit(lexer, lexer->token, next, lexer->x, lexer->y);
            /* fall through */
        case JSON_SKIP:
            /* Start a fresh token buffer and go back to the top-level rule. */
            QDECREF(lexer->token);
            lexer->token = qstring_new();
            next = IN_START;
            break;
        default:
            break;
        }

        lexer->state = next;

        if (consumed || flush) {
            break;
        }
    }

    /* Do not let a single token grow to an arbitrarily large size,
     * this is a security consideration.
     */
    if (lexer->token->length > MAX_TOKEN_SIZE) {
        lexer->emit(lexer, lexer->token, lexer->state, lexer->x, lexer->y);
        QDECREF(lexer->token);
        lexer->token = qstring_new();
        lexer->state = IN_START;
    }

    return 0;
}
347

    
348
/*
 * Feed SIZE bytes starting at BUFFER to LEXER, one byte at a time and in
 * order.
 *
 * Stops at the first byte for which json_lexer_feed_char() reports a
 * negative value and returns that value; returns 0 otherwise.
 */
int json_lexer_feed(JSONLexer *lexer, const char *buffer, size_t size)
{
    for (; size > 0; size--, buffer++) {
        int rc = json_lexer_feed_char(lexer, *buffer, false);

        if (rc < 0) {
            return rc;
        }
    }

    return 0;
}
363

    
364
/*
 * Flush LEXER at the end of the input.
 *
 * Nothing is pending when the lexer is in IN_START, so 0 is returned right
 * away.  Otherwise a NUL byte is fed in flush mode and the result of
 * json_lexer_feed_char() is returned.
 */
int json_lexer_flush(JSONLexer *lexer)
{
    if (lexer->state == IN_START) {
        return 0;
    }

    return json_lexer_feed_char(lexer, 0, true);
}
368

    
369
/*
 * Tear down LEXER: hand its token buffer to QDECREF.
 *
 * The JSONLexer structure itself is not touched beyond that.
 */
void json_lexer_destroy(JSONLexer *lexer)
{
    QDECREF(lexer->token);
}