root / json-lexer.c @ 325601b4
History | View | Annotate | Download (8.1 kB)
1 |
/*
|
---|---|
2 |
* JSON lexer
|
3 |
*
|
4 |
* Copyright IBM, Corp. 2009
|
5 |
*
|
6 |
* Authors:
|
7 |
* Anthony Liguori <aliguori@us.ibm.com>
|
8 |
*
|
9 |
* This work is licensed under the terms of the GNU LGPL, version 2.1 or later.
|
10 |
* See the COPYING.LIB file in the top-level directory.
|
11 |
*
|
12 |
*/
|
13 |
|
14 |
#include "qstring.h" |
15 |
#include "qlist.h" |
16 |
#include "qdict.h" |
17 |
#include "qint.h" |
18 |
#include "qemu-common.h" |
19 |
#include "json-lexer.h" |
20 |
|
21 |
/* Upper bound on the length of a single token: 64 MiB. */
#define MAX_TOKEN_SIZE (64ULL * 1024 * 1024)
22 |
|
23 |
/*
|
24 |
* \"([^\\\"]|(\\\"\\'\\\\\\/\\b\\f\\n\\r\\t\\u[0-9a-fA-F][0-9a-fA-F][0-9a-fA-F][0-9a-fA-F]))*\"
|
25 |
* '([^\\']|(\\\"\\'\\\\\\/\\b\\f\\n\\r\\t\\u[0-9a-fA-F][0-9a-fA-F][0-9a-fA-F][0-9a-fA-F]))*'
|
26 |
* -?(0(\.[0-9]+([eE][-+]?[0-9]+)?)?|[1-9][0-9]*(\.[0-9]+)?([eE][-+]?[0-9]+)?)
|
27 |
* [{}\[\],:]
|
28 |
* [a-z]+
|
29 |
*
|
30 |
*/
|
31 |
|
32 |
/*
 * States of the lexer's state machine.  Each value is used as a row index
 * into the json_lexer[] transition table and is stored in its uint8_t
 * cells, so the order and the values of the enumerators matter.
 */
enum json_lexer_state {
    /* Must stay 0: table cells that are not initialized mean "error". */
    IN_ERROR = 0,

    /* Double-quoted string, including the four hex digits of \uXXXX. */
    IN_DQ_UCODE3,
    IN_DQ_UCODE2,
    IN_DQ_UCODE1,
    IN_DQ_UCODE0,
    IN_DQ_STRING_ESCAPE,
    IN_DQ_STRING,

    /* Single-quoted string, including the four hex digits of \uXXXX. */
    IN_SQ_UCODE3,
    IN_SQ_UCODE2,
    IN_SQ_UCODE1,
    IN_SQ_UCODE0,
    IN_SQ_STRING_ESCAPE,
    IN_SQ_STRING,

    /* Numbers. */
    IN_ZERO,
    IN_DIGITS,
    IN_DIGIT,
    IN_EXP_E,
    IN_MANTISSA,
    IN_MANTISSA_DIGITS,
    IN_NONZERO_NUMBER,
    IN_NEG_NONZERO_NUMBER,

    /* Keywords ([a-z]+). */
    IN_KEYWORD,

    /* '%' escapes. */
    IN_ESCAPE,
    IN_ESCAPE_L,
    IN_ESCAPE_LL,
    IN_ESCAPE_I,
    IN_ESCAPE_I6,
    IN_ESCAPE_I64,

    /* Whitespace between tokens. */
    IN_WHITESPACE,

    /* Initial state; the lexer returns here after every token. */
    IN_START,
};
64 |
|
65 |
/*
 * Table-row initializer: map every byte in 0x00..0x7F to TOKEN.  Entries
 * listed after it in the same row override it for individual bytes.
 * Relies on the GNU C range-designator extension.
 */
#define TERMINAL(token) [0x00 ... 0x7F] = (token)
66 |
|
67 |
/*
 * True when the transition from state FROM to TOKEN was taken on a
 * lookahead character, i.e. a character that is not part of the token.
 * Rows built with the TERMINAL() macro hold the terminal in column 0, so
 * the check compares TOKEN against that cell.
 */
#define TERMINAL_NEEDED_LOOKAHEAD(from, token) \
    ((token) == json_lexer[(from)][0])
|
72 |
|
73 |
static const uint8_t json_lexer[][256] = { |
74 |
/* double quote string */
|
75 |
[IN_DQ_UCODE3] = { |
76 |
['0' ... '9'] = IN_DQ_STRING, |
77 |
['a' ... 'f'] = IN_DQ_STRING, |
78 |
['A' ... 'F'] = IN_DQ_STRING, |
79 |
}, |
80 |
[IN_DQ_UCODE2] = { |
81 |
['0' ... '9'] = IN_DQ_UCODE3, |
82 |
['a' ... 'f'] = IN_DQ_UCODE3, |
83 |
['A' ... 'F'] = IN_DQ_UCODE3, |
84 |
}, |
85 |
[IN_DQ_UCODE1] = { |
86 |
['0' ... '9'] = IN_DQ_UCODE2, |
87 |
['a' ... 'f'] = IN_DQ_UCODE2, |
88 |
['A' ... 'F'] = IN_DQ_UCODE2, |
89 |
}, |
90 |
[IN_DQ_UCODE0] = { |
91 |
['0' ... '9'] = IN_DQ_UCODE1, |
92 |
['a' ... 'f'] = IN_DQ_UCODE1, |
93 |
['A' ... 'F'] = IN_DQ_UCODE1, |
94 |
}, |
95 |
[IN_DQ_STRING_ESCAPE] = { |
96 |
['b'] = IN_DQ_STRING,
|
97 |
['f'] = IN_DQ_STRING,
|
98 |
['n'] = IN_DQ_STRING,
|
99 |
['r'] = IN_DQ_STRING,
|
100 |
['t'] = IN_DQ_STRING,
|
101 |
['/'] = IN_DQ_STRING,
|
102 |
['\\'] = IN_DQ_STRING,
|
103 |
['\''] = IN_DQ_STRING,
|
104 |
['\"'] = IN_DQ_STRING,
|
105 |
['u'] = IN_DQ_UCODE0,
|
106 |
}, |
107 |
[IN_DQ_STRING] = { |
108 |
[1 ... 0xFF] = IN_DQ_STRING, |
109 |
['\\'] = IN_DQ_STRING_ESCAPE,
|
110 |
['"'] = JSON_STRING,
|
111 |
}, |
112 |
|
113 |
/* single quote string */
|
114 |
[IN_SQ_UCODE3] = { |
115 |
['0' ... '9'] = IN_SQ_STRING, |
116 |
['a' ... 'f'] = IN_SQ_STRING, |
117 |
['A' ... 'F'] = IN_SQ_STRING, |
118 |
}, |
119 |
[IN_SQ_UCODE2] = { |
120 |
['0' ... '9'] = IN_SQ_UCODE3, |
121 |
['a' ... 'f'] = IN_SQ_UCODE3, |
122 |
['A' ... 'F'] = IN_SQ_UCODE3, |
123 |
}, |
124 |
[IN_SQ_UCODE1] = { |
125 |
['0' ... '9'] = IN_SQ_UCODE2, |
126 |
['a' ... 'f'] = IN_SQ_UCODE2, |
127 |
['A' ... 'F'] = IN_SQ_UCODE2, |
128 |
}, |
129 |
[IN_SQ_UCODE0] = { |
130 |
['0' ... '9'] = IN_SQ_UCODE1, |
131 |
['a' ... 'f'] = IN_SQ_UCODE1, |
132 |
['A' ... 'F'] = IN_SQ_UCODE1, |
133 |
}, |
134 |
[IN_SQ_STRING_ESCAPE] = { |
135 |
['b'] = IN_SQ_STRING,
|
136 |
['f'] = IN_SQ_STRING,
|
137 |
['n'] = IN_SQ_STRING,
|
138 |
['r'] = IN_SQ_STRING,
|
139 |
['t'] = IN_SQ_STRING,
|
140 |
['/'] = IN_DQ_STRING,
|
141 |
['\\'] = IN_DQ_STRING,
|
142 |
['\''] = IN_SQ_STRING,
|
143 |
['\"'] = IN_SQ_STRING,
|
144 |
['u'] = IN_SQ_UCODE0,
|
145 |
}, |
146 |
[IN_SQ_STRING] = { |
147 |
[1 ... 0xFF] = IN_SQ_STRING, |
148 |
['\\'] = IN_SQ_STRING_ESCAPE,
|
149 |
['\''] = JSON_STRING,
|
150 |
}, |
151 |
|
152 |
/* Zero */
|
153 |
[IN_ZERO] = { |
154 |
TERMINAL(JSON_INTEGER), |
155 |
['0' ... '9'] = IN_ERROR, |
156 |
['.'] = IN_MANTISSA,
|
157 |
}, |
158 |
|
159 |
/* Float */
|
160 |
[IN_DIGITS] = { |
161 |
TERMINAL(JSON_FLOAT), |
162 |
['0' ... '9'] = IN_DIGITS, |
163 |
}, |
164 |
|
165 |
[IN_DIGIT] = { |
166 |
['0' ... '9'] = IN_DIGITS, |
167 |
}, |
168 |
|
169 |
[IN_EXP_E] = { |
170 |
['-'] = IN_DIGIT,
|
171 |
['+'] = IN_DIGIT,
|
172 |
['0' ... '9'] = IN_DIGITS, |
173 |
}, |
174 |
|
175 |
[IN_MANTISSA_DIGITS] = { |
176 |
TERMINAL(JSON_FLOAT), |
177 |
['0' ... '9'] = IN_MANTISSA_DIGITS, |
178 |
['e'] = IN_EXP_E,
|
179 |
['E'] = IN_EXP_E,
|
180 |
}, |
181 |
|
182 |
[IN_MANTISSA] = { |
183 |
['0' ... '9'] = IN_MANTISSA_DIGITS, |
184 |
}, |
185 |
|
186 |
/* Number */
|
187 |
[IN_NONZERO_NUMBER] = { |
188 |
TERMINAL(JSON_INTEGER), |
189 |
['0' ... '9'] = IN_NONZERO_NUMBER, |
190 |
['e'] = IN_EXP_E,
|
191 |
['E'] = IN_EXP_E,
|
192 |
['.'] = IN_MANTISSA,
|
193 |
}, |
194 |
|
195 |
[IN_NEG_NONZERO_NUMBER] = { |
196 |
['0'] = IN_ZERO,
|
197 |
['1' ... '9'] = IN_NONZERO_NUMBER, |
198 |
}, |
199 |
|
200 |
/* keywords */
|
201 |
[IN_KEYWORD] = { |
202 |
TERMINAL(JSON_KEYWORD), |
203 |
['a' ... 'z'] = IN_KEYWORD, |
204 |
}, |
205 |
|
206 |
/* whitespace */
|
207 |
[IN_WHITESPACE] = { |
208 |
TERMINAL(JSON_SKIP), |
209 |
[' '] = IN_WHITESPACE,
|
210 |
['\t'] = IN_WHITESPACE,
|
211 |
['\r'] = IN_WHITESPACE,
|
212 |
['\n'] = IN_WHITESPACE,
|
213 |
}, |
214 |
|
215 |
/* escape */
|
216 |
[IN_ESCAPE_LL] = { |
217 |
['d'] = JSON_ESCAPE,
|
218 |
}, |
219 |
|
220 |
[IN_ESCAPE_L] = { |
221 |
['d'] = JSON_ESCAPE,
|
222 |
['l'] = IN_ESCAPE_LL,
|
223 |
}, |
224 |
|
225 |
[IN_ESCAPE_I64] = { |
226 |
['d'] = JSON_ESCAPE,
|
227 |
}, |
228 |
|
229 |
[IN_ESCAPE_I6] = { |
230 |
['4'] = IN_ESCAPE_I64,
|
231 |
}, |
232 |
|
233 |
[IN_ESCAPE_I] = { |
234 |
['6'] = IN_ESCAPE_I6,
|
235 |
}, |
236 |
|
237 |
[IN_ESCAPE] = { |
238 |
['d'] = JSON_ESCAPE,
|
239 |
['i'] = JSON_ESCAPE,
|
240 |
['p'] = JSON_ESCAPE,
|
241 |
['s'] = JSON_ESCAPE,
|
242 |
['f'] = JSON_ESCAPE,
|
243 |
['l'] = IN_ESCAPE_L,
|
244 |
['I'] = IN_ESCAPE_I,
|
245 |
}, |
246 |
|
247 |
/* top level rule */
|
248 |
[IN_START] = { |
249 |
['"'] = IN_DQ_STRING,
|
250 |
['\''] = IN_SQ_STRING,
|
251 |
['0'] = IN_ZERO,
|
252 |
['1' ... '9'] = IN_NONZERO_NUMBER, |
253 |
['-'] = IN_NEG_NONZERO_NUMBER,
|
254 |
['{'] = JSON_OPERATOR,
|
255 |
['}'] = JSON_OPERATOR,
|
256 |
['['] = JSON_OPERATOR,
|
257 |
[']'] = JSON_OPERATOR,
|
258 |
[','] = JSON_OPERATOR,
|
259 |
[':'] = JSON_OPERATOR,
|
260 |
['a' ... 'z'] = IN_KEYWORD, |
261 |
['%'] = IN_ESCAPE,
|
262 |
[' '] = IN_WHITESPACE,
|
263 |
['\t'] = IN_WHITESPACE,
|
264 |
['\r'] = IN_WHITESPACE,
|
265 |
['\n'] = IN_WHITESPACE,
|
266 |
}, |
267 |
}; |
268 |
|
269 |
/*
 * Set up LEXER for a new input stream: position counters at zero, an
 * empty token buffer, the state machine in IN_START, and FUNC installed
 * as the callback that receives completed tokens.
 */
void json_lexer_init(JSONLexer *lexer, JSONLexerEmitter func)
{
    lexer->x = 0;
    lexer->y = 0;
    lexer->token = qstring_new();
    lexer->state = IN_START;
    lexer->emit = func;
}
276 |
|
277 |
/*
 * Advance the state machine by one input byte.
 *
 * @lexer: lexer state to update
 * @ch:    input byte
 * @flush: nonzero when CH is the end-of-input sentinel passed by
 *         json_lexer_flush() rather than real input
 *
 * A completed token is handed to lexer->emit().  When the token was
 * completed by a lookahead character, that character is not part of the
 * token and is run through the state machine again from IN_START.  When
 * flushing there is no real lookahead character to re-run, so the loop
 * stops after one pass; otherwise the sentinel would be lexed from
 * IN_START and always yield an error.
 *
 * Returns 0 on success, -EINVAL if CH is not valid in the current state.
 */
static int json_lexer_feed_char(JSONLexer *lexer, char ch, int flush)
{
    int char_consumed, new_state;

    lexer->x++;
    if (ch == '\n') {
        lexer->x = 0;
        lexer->y++;
    }

    do {
        new_state = json_lexer[lexer->state][(uint8_t)ch];
        char_consumed = !TERMINAL_NEEDED_LOOKAHEAD(lexer->state, new_state);
        if (char_consumed) {
            qstring_append_chr(lexer->token, ch);
        }

        switch (new_state) {
        case JSON_OPERATOR:
        case JSON_ESCAPE:
        case JSON_INTEGER:
        case JSON_FLOAT:
        case JSON_KEYWORD:
        case JSON_STRING:
            lexer->emit(lexer, lexer->token, new_state, lexer->x, lexer->y);
            /* fall through: reset the token buffer like JSON_SKIP does */
        case JSON_SKIP:
            QDECREF(lexer->token);
            lexer->token = qstring_new();
            new_state = IN_START;
            break;
        case IN_ERROR:
            return -EINVAL;
        default:
            break;
        }
        lexer->state = new_state;
    } while (!char_consumed && !flush);

    /* Do not let a single token grow to an arbitrarily large size,
     * this is a security consideration.
     */
    if (lexer->token->length > MAX_TOKEN_SIZE) {
        lexer->emit(lexer, lexer->token, lexer->state, lexer->x, lexer->y);
        QDECREF(lexer->token);
        lexer->token = qstring_new();
        lexer->state = IN_START;
    }

    return 0;
}

/*
 * Feed SIZE bytes from BUFFER to the lexer, one byte at a time.
 *
 * Returns 0 if every byte was accepted.  Stops at the first byte the
 * state machine rejects and returns that error (-EINVAL).
 */
int json_lexer_feed(JSONLexer *lexer, const char *buffer, size_t size)
{
    size_t i;

    for (i = 0; i < size; i++) {
        int err;

        err = json_lexer_feed_char(lexer, buffer[i], 0);
        if (err < 0) {
            return err;
        }
    }

    return 0;
}

/*
 * Signal end of input so that a token still waiting for a lookahead
 * character gets emitted.
 *
 * Returns 0 if the lexer is between tokens or the pending token could be
 * completed, -EINVAL if the input ended in the middle of a token.
 */
int json_lexer_flush(JSONLexer *lexer)
{
    if (lexer->state == IN_START) {
        return 0;
    }
    return json_lexer_feed_char(lexer, 0, 1);
}
348 |
|
349 |
/*
 * Release the token buffer held by LEXER.  The pointer is cleared
 * afterwards so the structure does not keep a dangling reference.
 */
void json_lexer_destroy(JSONLexer *lexer)
{
    QDECREF(lexer->token);
    lexer->token = NULL;
}
} |