Statistics
| Branch: | Revision:

root / json-lexer.c @ a74cdab4

History | View | Annotate | Download (7.8 kB)

1
/*
2
 * JSON lexer
3
 *
4
 * Copyright IBM, Corp. 2009
5
 *
6
 * Authors:
7
 *  Anthony Liguori   <aliguori@us.ibm.com>
8
 *
9
 * This work is licensed under the terms of the GNU LGPL, version 2.1 or later.
10
 * See the COPYING.LIB file in the top-level directory.
11
 *
12
 */
13

    
14
#include "qstring.h"
15
#include "qlist.h"
16
#include "qdict.h"
17
#include "qint.h"
18
#include "qemu-common.h"
19
#include "json-lexer.h"
20

    
21
/*
 * \"([^\\\"]|\\[\"'\\/bfnrt]|\\u[0-9a-fA-F][0-9a-fA-F][0-9a-fA-F][0-9a-fA-F])*\"
 * '([^\\']|\\[\"'\\/bfnrt]|\\u[0-9a-fA-F][0-9a-fA-F][0-9a-fA-F][0-9a-fA-F])*'
 * -?(0(\.[0-9]+([eE][-+]?[0-9]+)?)?|[1-9][0-9]*(\.[0-9]+)?([eE][-+]?[0-9]+)?)
 * [{}\[\],:]
 * [a-z]+
 *
 */
29

    
30
/*
 * States of the lexer's state machine.  The values index the first
 * dimension of the json_lexer[] transition table.
 *
 * IN_ERROR must stay 0: every table entry that is not explicitly
 * initialized defaults to 0 and therefore means "invalid input".
 * The order of the enumerators is part of the table layout; do not
 * reorder them.
 */
enum json_lexer_state {
    IN_ERROR = 0,

    /* inside a "..." string; UCODEn = n hex digits of \uXXXX seen so far */
    IN_DQ_UCODE3,
    IN_DQ_UCODE2,
    IN_DQ_UCODE1,
    IN_DQ_UCODE0,
    IN_DQ_STRING_ESCAPE,
    IN_DQ_STRING,

    /* inside a '...' string; same layout as the double-quote states */
    IN_SQ_UCODE3,
    IN_SQ_UCODE2,
    IN_SQ_UCODE1,
    IN_SQ_UCODE0,
    IN_SQ_STRING_ESCAPE,
    IN_SQ_STRING,

    /* numbers */
    IN_ZERO,
    IN_DIGITS,
    IN_DIGIT,
    IN_EXP_E,
    IN_MANTISSA,
    IN_MANTISSA_DIGITS,
    IN_NONZERO_NUMBER,
    IN_NEG_NONZERO_NUMBER,

    /* bare lower-case words */
    IN_KEYWORD,

    /* '%' escapes: %d %i %p %s %f %ld %lld %I64d */
    IN_ESCAPE,
    IN_ESCAPE_L,
    IN_ESCAPE_LL,
    IN_ESCAPE_I,
    IN_ESCAPE_I6,
    IN_ESCAPE_I64,

    IN_WHITESPACE,

    /* between tokens */
    IN_START,
};
62

    
63
/*
 * Table-row helper: make every 7-bit byte (0x00..0x7F) end the current
 * token as STATE.  It is meant to be the first entry of a row so that
 * later designators in the same row override it for the bytes that
 * continue the token.  Bytes 0x80..0xFF are not covered and keep their
 * default of 0.
 */
#define TERMINAL(state) [0 ... 0x7F] = (state)

/* Return whether TERMINAL is a terminal state and the transition to it
   from OLD_STATE required lookahead.  This happens whenever the table
   below uses the TERMINAL macro.  The check relies on byte 0 never being
   overridden in a row that uses TERMINAL, so column 0 of such a row still
   holds the terminal state.  In a row without TERMINAL column 0 is 0, so
   the macro is also true when TERMINAL is 0 (the error state).  */
#define TERMINAL_NEEDED_LOOKAHEAD(old_state, terminal) \
            (json_lexer[(old_state)][0] == (terminal))
70

    
71
static const uint8_t json_lexer[][256] =  {
72
    /* double quote string */
73
    [IN_DQ_UCODE3] = {
74
        ['0' ... '9'] = IN_DQ_STRING,
75
        ['a' ... 'f'] = IN_DQ_STRING,
76
        ['A' ... 'F'] = IN_DQ_STRING,
77
    },
78
    [IN_DQ_UCODE2] = {
79
        ['0' ... '9'] = IN_DQ_UCODE3,
80
        ['a' ... 'f'] = IN_DQ_UCODE3,
81
        ['A' ... 'F'] = IN_DQ_UCODE3,
82
    },
83
    [IN_DQ_UCODE1] = {
84
        ['0' ... '9'] = IN_DQ_UCODE2,
85
        ['a' ... 'f'] = IN_DQ_UCODE2,
86
        ['A' ... 'F'] = IN_DQ_UCODE2,
87
    },
88
    [IN_DQ_UCODE0] = {
89
        ['0' ... '9'] = IN_DQ_UCODE1,
90
        ['a' ... 'f'] = IN_DQ_UCODE1,
91
        ['A' ... 'F'] = IN_DQ_UCODE1,
92
    },
93
    [IN_DQ_STRING_ESCAPE] = {
94
        ['b'] = IN_DQ_STRING,
95
        ['f'] =  IN_DQ_STRING,
96
        ['n'] =  IN_DQ_STRING,
97
        ['r'] =  IN_DQ_STRING,
98
        ['t'] =  IN_DQ_STRING,
99
        ['/'] = IN_DQ_STRING,
100
        ['\\'] = IN_DQ_STRING,
101
        ['\''] = IN_DQ_STRING,
102
        ['\"'] = IN_DQ_STRING,
103
        ['u'] = IN_DQ_UCODE0,
104
    },
105
    [IN_DQ_STRING] = {
106
        [1 ... 0xFF] = IN_DQ_STRING,
107
        ['\\'] = IN_DQ_STRING_ESCAPE,
108
        ['"'] = JSON_STRING,
109
    },
110

    
111
    /* single quote string */
112
    [IN_SQ_UCODE3] = {
113
        ['0' ... '9'] = IN_SQ_STRING,
114
        ['a' ... 'f'] = IN_SQ_STRING,
115
        ['A' ... 'F'] = IN_SQ_STRING,
116
    },
117
    [IN_SQ_UCODE2] = {
118
        ['0' ... '9'] = IN_SQ_UCODE3,
119
        ['a' ... 'f'] = IN_SQ_UCODE3,
120
        ['A' ... 'F'] = IN_SQ_UCODE3,
121
    },
122
    [IN_SQ_UCODE1] = {
123
        ['0' ... '9'] = IN_SQ_UCODE2,
124
        ['a' ... 'f'] = IN_SQ_UCODE2,
125
        ['A' ... 'F'] = IN_SQ_UCODE2,
126
    },
127
    [IN_SQ_UCODE0] = {
128
        ['0' ... '9'] = IN_SQ_UCODE1,
129
        ['a' ... 'f'] = IN_SQ_UCODE1,
130
        ['A' ... 'F'] = IN_SQ_UCODE1,
131
    },
132
    [IN_SQ_STRING_ESCAPE] = {
133
        ['b'] = IN_SQ_STRING,
134
        ['f'] =  IN_SQ_STRING,
135
        ['n'] =  IN_SQ_STRING,
136
        ['r'] =  IN_SQ_STRING,
137
        ['t'] =  IN_SQ_STRING,
138
        ['/'] = IN_DQ_STRING,
139
        ['\\'] = IN_DQ_STRING,
140
        ['\''] = IN_SQ_STRING,
141
        ['\"'] = IN_SQ_STRING,
142
        ['u'] = IN_SQ_UCODE0,
143
    },
144
    [IN_SQ_STRING] = {
145
        [1 ... 0xFF] = IN_SQ_STRING,
146
        ['\\'] = IN_SQ_STRING_ESCAPE,
147
        ['\''] = JSON_STRING,
148
    },
149

    
150
    /* Zero */
151
    [IN_ZERO] = {
152
        TERMINAL(JSON_INTEGER),
153
        ['0' ... '9'] = IN_ERROR,
154
        ['.'] = IN_MANTISSA,
155
    },
156

    
157
    /* Float */
158
    [IN_DIGITS] = {
159
        TERMINAL(JSON_FLOAT),
160
        ['0' ... '9'] = IN_DIGITS,
161
    },
162

    
163
    [IN_DIGIT] = {
164
        ['0' ... '9'] = IN_DIGITS,
165
    },
166

    
167
    [IN_EXP_E] = {
168
        ['-'] = IN_DIGIT,
169
        ['+'] = IN_DIGIT,
170
        ['0' ... '9'] = IN_DIGITS,
171
    },
172

    
173
    [IN_MANTISSA_DIGITS] = {
174
        TERMINAL(JSON_FLOAT),
175
        ['0' ... '9'] = IN_MANTISSA_DIGITS,
176
        ['e'] = IN_EXP_E,
177
        ['E'] = IN_EXP_E,
178
    },
179

    
180
    [IN_MANTISSA] = {
181
        ['0' ... '9'] = IN_MANTISSA_DIGITS,
182
    },
183

    
184
    /* Number */
185
    [IN_NONZERO_NUMBER] = {
186
        TERMINAL(JSON_INTEGER),
187
        ['0' ... '9'] = IN_NONZERO_NUMBER,
188
        ['e'] = IN_EXP_E,
189
        ['E'] = IN_EXP_E,
190
        ['.'] = IN_MANTISSA,
191
    },
192

    
193
    [IN_NEG_NONZERO_NUMBER] = {
194
        ['0'] = IN_ZERO,
195
        ['1' ... '9'] = IN_NONZERO_NUMBER,
196
    },
197

    
198
    /* keywords */
199
    [IN_KEYWORD] = {
200
        TERMINAL(JSON_KEYWORD),
201
        ['a' ... 'z'] = IN_KEYWORD,
202
    },
203

    
204
    /* whitespace */
205
    [IN_WHITESPACE] = {
206
        TERMINAL(JSON_SKIP),
207
        [' '] = IN_WHITESPACE,
208
        ['\t'] = IN_WHITESPACE,
209
        ['\r'] = IN_WHITESPACE,
210
        ['\n'] = IN_WHITESPACE,
211
    },        
212

    
213
    /* escape */
214
    [IN_ESCAPE_LL] = {
215
        ['d'] = JSON_ESCAPE,
216
    },
217

    
218
    [IN_ESCAPE_L] = {
219
        ['d'] = JSON_ESCAPE,
220
        ['l'] = IN_ESCAPE_LL,
221
    },
222

    
223
    [IN_ESCAPE_I64] = {
224
        ['d'] = JSON_ESCAPE,
225
    },
226

    
227
    [IN_ESCAPE_I6] = {
228
        ['4'] = IN_ESCAPE_I64,
229
    },
230

    
231
    [IN_ESCAPE_I] = {
232
        ['6'] = IN_ESCAPE_I6,
233
    },
234

    
235
    [IN_ESCAPE] = {
236
        ['d'] = JSON_ESCAPE,
237
        ['i'] = JSON_ESCAPE,
238
        ['p'] = JSON_ESCAPE,
239
        ['s'] = JSON_ESCAPE,
240
        ['f'] = JSON_ESCAPE,
241
        ['l'] = IN_ESCAPE_L,
242
        ['I'] = IN_ESCAPE_I,
243
    },
244

    
245
    /* top level rule */
246
    [IN_START] = {
247
        ['"'] = IN_DQ_STRING,
248
        ['\''] = IN_SQ_STRING,
249
        ['0'] = IN_ZERO,
250
        ['1' ... '9'] = IN_NONZERO_NUMBER,
251
        ['-'] = IN_NEG_NONZERO_NUMBER,
252
        ['{'] = JSON_OPERATOR,
253
        ['}'] = JSON_OPERATOR,
254
        ['['] = JSON_OPERATOR,
255
        [']'] = JSON_OPERATOR,
256
        [','] = JSON_OPERATOR,
257
        [':'] = JSON_OPERATOR,
258
        ['a' ... 'z'] = IN_KEYWORD,
259
        ['%'] = IN_ESCAPE,
260
        [' '] = IN_WHITESPACE,
261
        ['\t'] = IN_WHITESPACE,
262
        ['\r'] = IN_WHITESPACE,
263
        ['\n'] = IN_WHITESPACE,
264
    },
265
};
266

    
267
/*
 * Prepare LEXER for use.
 *
 * FUNC is stored as the callback invoked for every completed token.
 * The lexer starts between tokens (IN_START) with a fresh, empty token
 * buffer and its x/y position counters at zero.  The token buffer is
 * released again by json_lexer_destroy().
 */
void json_lexer_init(JSONLexer *lexer, JSONLexerEmitter func)
{
    lexer->emit = func;
    lexer->state = IN_START;
    lexer->token = qstring_new();
    lexer->x = 0;
    lexer->y = 0;
}
274

    
275
/*
 * Advance the state machine by one input byte CH.
 *
 * The position counters are updated first: x is incremented for every
 * byte, and a newline resets x to 0 and increments y.
 *
 * When a token completes, lexer->emit() is called with the accumulated
 * token text, the token type and the current x/y, after which the token
 * buffer is replaced by a fresh one and the lexer returns to IN_START.
 * JSON_SKIP tokens are discarded without calling emit().
 *
 * If the token was ended by lookahead (see TERMINAL_NEEDED_LOOKAHEAD),
 * CH is not part of it; the loop then runs again so that CH is lexed
 * from IN_START as the beginning of the next token.
 *
 * Returns 0 on success, or -EINVAL when CH is not valid in the current
 * state.  On error lexer->state is left unchanged.
 */
static int json_lexer_feed_char(JSONLexer *lexer, char ch)
{
    int char_consumed, new_state;

    lexer->x++;
    if (ch == '\n') {
        lexer->x = 0;
        lexer->y++;
    }

    do {
        /* cast so that bytes >= 0x80 do not index the table negatively */
        new_state = json_lexer[lexer->state][(uint8_t)ch];
        char_consumed = !TERMINAL_NEEDED_LOOKAHEAD(lexer->state, new_state);
        if (char_consumed) {
            qstring_append_chr(lexer->token, ch);
        }

        switch (new_state) {
        case JSON_OPERATOR:
        case JSON_ESCAPE:
        case JSON_INTEGER:
        case JSON_FLOAT:
        case JSON_KEYWORD:
        case JSON_STRING:
            lexer->emit(lexer, lexer->token, new_state, lexer->x, lexer->y);
            /* fall through: emitted tokens are reset like skipped ones */
        case JSON_SKIP:
            QDECREF(lexer->token);
            lexer->token = qstring_new();
            new_state = IN_START;
            break;
        case IN_ERROR:
            return -EINVAL;
        default:
            break;
        }
        lexer->state = new_state;
    } while (!char_consumed);

    return 0;
}
314

    
315
/*
 * Feed SIZE bytes starting at BUFFER to the lexer, one byte at a time.
 *
 * Stops at the first byte that the lexer rejects and returns that
 * negative error code; the remaining bytes are not examined.  Returns 0
 * when every byte was accepted (including when SIZE is 0).
 */
int json_lexer_feed(JSONLexer *lexer, const char *buffer, size_t size)
{
    size_t i;

    for (i = 0; i < size; i++) {
        int err;

        err = json_lexer_feed_char(lexer, buffer[i]);
        if (err < 0) {
            return err;
        }
    }

    return 0;
}
330

    
331
/*
 * Signal end of input.
 *
 * If the lexer is between tokens (IN_START) there is nothing pending
 * and 0 is returned.  Otherwise a NUL byte is fed to the state machine
 * and its result is returned: states whose table row uses TERMINAL()
 * complete their token on it, any other state has no transition for
 * NUL and yields -EINVAL.
 */
int json_lexer_flush(JSONLexer *lexer)
{
    if (lexer->state == IN_START) {
        return 0;
    }

    return json_lexer_feed_char(lexer, 0);
}
335

    
336
/*
 * Release the lexer's reference to its token buffer.
 *
 * LEXER itself is not freed here.
 */
void json_lexer_destroy(JSONLexer *lexer)
{
    QDECREF(lexer->token);
}