root / json-lexer.c @ a74cdab4
History | View | Annotate | Download (7.8 kB)
1 |
/*
 * JSON lexer
 *
 * Copyright IBM, Corp. 2009
 *
 * Authors:
 *  Anthony Liguori   <aliguori@us.ibm.com>
 *
 * This work is licensed under the terms of the GNU LGPL, version 2.1 or later.
 * See the COPYING.LIB file in the top-level directory.
 *
 */
|
13 |
|
14 |
#include "qstring.h" |
15 |
#include "qlist.h" |
16 |
#include "qdict.h" |
17 |
#include "qint.h" |
18 |
#include "qemu-common.h" |
19 |
#include "json-lexer.h" |
20 |
|
21 |
/*
 * \"([^\\\"]|(\\\"\\'\\\\\\/\\b\\f\\n\\r\\t\\u[0-9a-fA-F][0-9a-fA-F][0-9a-fA-F][0-9a-fA-F]))*\"
 * '([^\\']|(\\\"\\'\\\\\\/\\b\\f\\n\\r\\t\\u[0-9a-fA-F][0-9a-fA-F][0-9a-fA-F][0-9a-fA-F]))*'
 * -?(0(\.[0-9]+([eE][-+]?[0-9]+)?)?|[1-9][0-9]*(\.[0-9]+)?([eE][-+]?[0-9]+)?)
 * [{}\[\],:]
 * [a-z]+
 *
 */
|
29 |
|
30 |
/*
 * States of the lexer's state machine.
 *
 * IN_ERROR must stay 0: entries of the transition table that are not
 * explicitly initialised are zero, so any unexpected input byte leads to
 * IN_ERROR.  The numeric order of the remaining states is not significant
 * to the table, which is indexed by name.
 */
enum json_lexer_state {
    IN_ERROR = 0,               /* invalid input */

    /* double-quoted string */
    IN_DQ_UCODE3,               /* \uXXX seen, expecting last hex digit */
    IN_DQ_UCODE2,               /* \uXX seen */
    IN_DQ_UCODE1,               /* \uX seen */
    IN_DQ_UCODE0,               /* \u seen, expecting first hex digit */
    IN_DQ_STRING_ESCAPE,        /* backslash seen */
    IN_DQ_STRING,               /* inside "..." */

    /* single-quoted string */
    IN_SQ_UCODE3,               /* \uXXX seen, expecting last hex digit */
    IN_SQ_UCODE2,               /* \uXX seen */
    IN_SQ_UCODE1,               /* \uX seen */
    IN_SQ_UCODE0,               /* \u seen, expecting first hex digit */
    IN_SQ_STRING_ESCAPE,        /* backslash seen */
    IN_SQ_STRING,               /* inside '...' */

    /* numbers */
    IN_ZERO,                    /* a leading '0' */
    IN_DIGITS,                  /* exponent digits */
    IN_DIGIT,                   /* exponent sign seen, digit required */
    IN_EXP_E,                   /* 'e' or 'E' seen */
    IN_MANTISSA,                /* '.' seen, digit required */
    IN_MANTISSA_DIGITS,         /* fraction digits */
    IN_NONZERO_NUMBER,          /* integer part starting with 1-9 */
    IN_NEG_NONZERO_NUMBER,      /* leading '-' seen */

    /* bare words */
    IN_KEYWORD,                 /* run of [a-z] */

    /* '%' escapes: %d %i %p %s %f %ld %lld %I64d */
    IN_ESCAPE,                  /* '%' seen */
    IN_ESCAPE_L,                /* "%l" seen */
    IN_ESCAPE_LL,               /* "%ll" seen */
    IN_ESCAPE_I,                /* "%I" seen */
    IN_ESCAPE_I6,               /* "%I6" seen */
    IN_ESCAPE_I64,              /* "%I64" seen */

    IN_WHITESPACE,              /* run of blanks between tokens */
    IN_START,                   /* between tokens; initial state */
};
62 |
|
63 |
/* Table-row helper: map every 7-bit input byte (0x00-0x7F) to the token
   type TOKEN.  Entries listed after it in the same row override it. */
#define TERMINAL(token) [0 ... 0x7F] = (token)
64 |
|
65 |
/* Evaluate to non-zero when the transition from OLD_STATE to the terminal
   state TERMINAL was taken only by peeking at the next character, i.e. the
   character that triggered it does not belong to the finished token.
   Rows built with the TERMINAL macro store the terminal in entry 0, so
   comparing against that entry identifies them.  Rows without TERMINAL
   hold 0 (IN_ERROR) in entry 0. */
#define TERMINAL_NEEDED_LOOKAHEAD(old_state, terminal) \
    ((terminal) == json_lexer[(old_state)][0])
|
70 |
|
71 |
static const uint8_t json_lexer[][256] = { |
72 |
/* double quote string */
|
73 |
[IN_DQ_UCODE3] = { |
74 |
['0' ... '9'] = IN_DQ_STRING, |
75 |
['a' ... 'f'] = IN_DQ_STRING, |
76 |
['A' ... 'F'] = IN_DQ_STRING, |
77 |
}, |
78 |
[IN_DQ_UCODE2] = { |
79 |
['0' ... '9'] = IN_DQ_UCODE3, |
80 |
['a' ... 'f'] = IN_DQ_UCODE3, |
81 |
['A' ... 'F'] = IN_DQ_UCODE3, |
82 |
}, |
83 |
[IN_DQ_UCODE1] = { |
84 |
['0' ... '9'] = IN_DQ_UCODE2, |
85 |
['a' ... 'f'] = IN_DQ_UCODE2, |
86 |
['A' ... 'F'] = IN_DQ_UCODE2, |
87 |
}, |
88 |
[IN_DQ_UCODE0] = { |
89 |
['0' ... '9'] = IN_DQ_UCODE1, |
90 |
['a' ... 'f'] = IN_DQ_UCODE1, |
91 |
['A' ... 'F'] = IN_DQ_UCODE1, |
92 |
}, |
93 |
[IN_DQ_STRING_ESCAPE] = { |
94 |
['b'] = IN_DQ_STRING,
|
95 |
['f'] = IN_DQ_STRING,
|
96 |
['n'] = IN_DQ_STRING,
|
97 |
['r'] = IN_DQ_STRING,
|
98 |
['t'] = IN_DQ_STRING,
|
99 |
['/'] = IN_DQ_STRING,
|
100 |
['\\'] = IN_DQ_STRING,
|
101 |
['\''] = IN_DQ_STRING,
|
102 |
['\"'] = IN_DQ_STRING,
|
103 |
['u'] = IN_DQ_UCODE0,
|
104 |
}, |
105 |
[IN_DQ_STRING] = { |
106 |
[1 ... 0xFF] = IN_DQ_STRING, |
107 |
['\\'] = IN_DQ_STRING_ESCAPE,
|
108 |
['"'] = JSON_STRING,
|
109 |
}, |
110 |
|
111 |
/* single quote string */
|
112 |
[IN_SQ_UCODE3] = { |
113 |
['0' ... '9'] = IN_SQ_STRING, |
114 |
['a' ... 'f'] = IN_SQ_STRING, |
115 |
['A' ... 'F'] = IN_SQ_STRING, |
116 |
}, |
117 |
[IN_SQ_UCODE2] = { |
118 |
['0' ... '9'] = IN_SQ_UCODE3, |
119 |
['a' ... 'f'] = IN_SQ_UCODE3, |
120 |
['A' ... 'F'] = IN_SQ_UCODE3, |
121 |
}, |
122 |
[IN_SQ_UCODE1] = { |
123 |
['0' ... '9'] = IN_SQ_UCODE2, |
124 |
['a' ... 'f'] = IN_SQ_UCODE2, |
125 |
['A' ... 'F'] = IN_SQ_UCODE2, |
126 |
}, |
127 |
[IN_SQ_UCODE0] = { |
128 |
['0' ... '9'] = IN_SQ_UCODE1, |
129 |
['a' ... 'f'] = IN_SQ_UCODE1, |
130 |
['A' ... 'F'] = IN_SQ_UCODE1, |
131 |
}, |
132 |
[IN_SQ_STRING_ESCAPE] = { |
133 |
['b'] = IN_SQ_STRING,
|
134 |
['f'] = IN_SQ_STRING,
|
135 |
['n'] = IN_SQ_STRING,
|
136 |
['r'] = IN_SQ_STRING,
|
137 |
['t'] = IN_SQ_STRING,
|
138 |
['/'] = IN_DQ_STRING,
|
139 |
['\\'] = IN_DQ_STRING,
|
140 |
['\''] = IN_SQ_STRING,
|
141 |
['\"'] = IN_SQ_STRING,
|
142 |
['u'] = IN_SQ_UCODE0,
|
143 |
}, |
144 |
[IN_SQ_STRING] = { |
145 |
[1 ... 0xFF] = IN_SQ_STRING, |
146 |
['\\'] = IN_SQ_STRING_ESCAPE,
|
147 |
['\''] = JSON_STRING,
|
148 |
}, |
149 |
|
150 |
/* Zero */
|
151 |
[IN_ZERO] = { |
152 |
TERMINAL(JSON_INTEGER), |
153 |
['0' ... '9'] = IN_ERROR, |
154 |
['.'] = IN_MANTISSA,
|
155 |
}, |
156 |
|
157 |
/* Float */
|
158 |
[IN_DIGITS] = { |
159 |
TERMINAL(JSON_FLOAT), |
160 |
['0' ... '9'] = IN_DIGITS, |
161 |
}, |
162 |
|
163 |
[IN_DIGIT] = { |
164 |
['0' ... '9'] = IN_DIGITS, |
165 |
}, |
166 |
|
167 |
[IN_EXP_E] = { |
168 |
['-'] = IN_DIGIT,
|
169 |
['+'] = IN_DIGIT,
|
170 |
['0' ... '9'] = IN_DIGITS, |
171 |
}, |
172 |
|
173 |
[IN_MANTISSA_DIGITS] = { |
174 |
TERMINAL(JSON_FLOAT), |
175 |
['0' ... '9'] = IN_MANTISSA_DIGITS, |
176 |
['e'] = IN_EXP_E,
|
177 |
['E'] = IN_EXP_E,
|
178 |
}, |
179 |
|
180 |
[IN_MANTISSA] = { |
181 |
['0' ... '9'] = IN_MANTISSA_DIGITS, |
182 |
}, |
183 |
|
184 |
/* Number */
|
185 |
[IN_NONZERO_NUMBER] = { |
186 |
TERMINAL(JSON_INTEGER), |
187 |
['0' ... '9'] = IN_NONZERO_NUMBER, |
188 |
['e'] = IN_EXP_E,
|
189 |
['E'] = IN_EXP_E,
|
190 |
['.'] = IN_MANTISSA,
|
191 |
}, |
192 |
|
193 |
[IN_NEG_NONZERO_NUMBER] = { |
194 |
['0'] = IN_ZERO,
|
195 |
['1' ... '9'] = IN_NONZERO_NUMBER, |
196 |
}, |
197 |
|
198 |
/* keywords */
|
199 |
[IN_KEYWORD] = { |
200 |
TERMINAL(JSON_KEYWORD), |
201 |
['a' ... 'z'] = IN_KEYWORD, |
202 |
}, |
203 |
|
204 |
/* whitespace */
|
205 |
[IN_WHITESPACE] = { |
206 |
TERMINAL(JSON_SKIP), |
207 |
[' '] = IN_WHITESPACE,
|
208 |
['\t'] = IN_WHITESPACE,
|
209 |
['\r'] = IN_WHITESPACE,
|
210 |
['\n'] = IN_WHITESPACE,
|
211 |
}, |
212 |
|
213 |
/* escape */
|
214 |
[IN_ESCAPE_LL] = { |
215 |
['d'] = JSON_ESCAPE,
|
216 |
}, |
217 |
|
218 |
[IN_ESCAPE_L] = { |
219 |
['d'] = JSON_ESCAPE,
|
220 |
['l'] = IN_ESCAPE_LL,
|
221 |
}, |
222 |
|
223 |
[IN_ESCAPE_I64] = { |
224 |
['d'] = JSON_ESCAPE,
|
225 |
}, |
226 |
|
227 |
[IN_ESCAPE_I6] = { |
228 |
['4'] = IN_ESCAPE_I64,
|
229 |
}, |
230 |
|
231 |
[IN_ESCAPE_I] = { |
232 |
['6'] = IN_ESCAPE_I6,
|
233 |
}, |
234 |
|
235 |
[IN_ESCAPE] = { |
236 |
['d'] = JSON_ESCAPE,
|
237 |
['i'] = JSON_ESCAPE,
|
238 |
['p'] = JSON_ESCAPE,
|
239 |
['s'] = JSON_ESCAPE,
|
240 |
['f'] = JSON_ESCAPE,
|
241 |
['l'] = IN_ESCAPE_L,
|
242 |
['I'] = IN_ESCAPE_I,
|
243 |
}, |
244 |
|
245 |
/* top level rule */
|
246 |
[IN_START] = { |
247 |
['"'] = IN_DQ_STRING,
|
248 |
['\''] = IN_SQ_STRING,
|
249 |
['0'] = IN_ZERO,
|
250 |
['1' ... '9'] = IN_NONZERO_NUMBER, |
251 |
['-'] = IN_NEG_NONZERO_NUMBER,
|
252 |
['{'] = JSON_OPERATOR,
|
253 |
['}'] = JSON_OPERATOR,
|
254 |
['['] = JSON_OPERATOR,
|
255 |
[']'] = JSON_OPERATOR,
|
256 |
[','] = JSON_OPERATOR,
|
257 |
[':'] = JSON_OPERATOR,
|
258 |
['a' ... 'z'] = IN_KEYWORD, |
259 |
['%'] = IN_ESCAPE,
|
260 |
[' '] = IN_WHITESPACE,
|
261 |
['\t'] = IN_WHITESPACE,
|
262 |
['\r'] = IN_WHITESPACE,
|
263 |
['\n'] = IN_WHITESPACE,
|
264 |
}, |
265 |
}; |
266 |
|
267 |
/*
 * Prepare LEXER for use: start in IN_START with an empty token buffer and
 * the position counters at zero.  FUNC is stored as the callback invoked
 * for every completed token.
 */
void json_lexer_init(JSONLexer *lexer, JSONLexerEmitter func)
{
    lexer->x = 0;
    lexer->y = 0;
    lexer->state = IN_START;
    lexer->emit = func;
    lexer->token = qstring_new();
}
274 |
|
275 |
/*
 * Advance the state machine by one input character.
 *
 * A character that ends a token only by lookahead (see
 * TERMINAL_NEEDED_LOOKAHEAD) is not part of that token: the token is
 * finished first and the same character is then run through the table
 * again from IN_START.
 *
 * Returns 0 on success, -EINVAL when the table yields IN_ERROR.
 */
static int json_lexer_feed_char(JSONLexer *lexer, char ch)
{
    const uint8_t byte = (uint8_t)ch;
    int retry_char;

    /* Track the position reported to the emit callback. */
    lexer->x++;
    if (ch == '\n') {
        lexer->x = 0;
        lexer->y++;
    }

    do {
        int next = json_lexer[lexer->state][byte];

        /* Lookahead characters are not appended; they get another pass. */
        retry_char = TERMINAL_NEEDED_LOOKAHEAD(lexer->state, next);
        if (!retry_char) {
            qstring_append_chr(lexer->token, ch);
        }

        switch (next) {
        case IN_ERROR:
            return -EINVAL;
        case JSON_STRING:
        case JSON_KEYWORD:
        case JSON_FLOAT:
        case JSON_INTEGER:
        case JSON_ESCAPE:
        case JSON_OPERATOR:
            lexer->emit(lexer, lexer->token, next, lexer->x, lexer->y);
            /* fall through: the token buffer is reset after emitting */
        case JSON_SKIP:
            QDECREF(lexer->token);
            lexer->token = qstring_new();
            next = IN_START;
            break;
        default:
            /* still in the middle of a token */
            break;
        }
        lexer->state = next;
    } while (retry_char);

    return 0;
}
314 |
|
315 |
/*
 * Feed SIZE bytes starting at BUFFER to the lexer, one character at a
 * time.  Stops at the first character the lexer rejects and returns that
 * negative error code; returns 0 when the whole buffer was accepted.
 */
int json_lexer_feed(JSONLexer *lexer, const char *buffer, size_t size)
{
    size_t pos = 0;

    while (pos < size) {
        int rc = json_lexer_feed_char(lexer, buffer[pos]);

        if (rc < 0) {
            return rc;
        }
        pos++;
    }

    return 0;
}
330 |
|
331 |
/*
 * Signal end of input.  When the lexer is in the middle of a token, a NUL
 * byte is fed so that a token waiting for lookahead is completed; the
 * result of that feed is returned.  Returns 0 without doing anything when
 * the lexer is already between tokens.
 */
int json_lexer_flush(JSONLexer *lexer)
{
    if (lexer->state == IN_START) {
        return 0;
    }

    return json_lexer_feed_char(lexer, 0);
}
335 |
|
336 |
/*
 * Release the lexer's reference to its current token buffer.  The lexer
 * structure itself is not freed here.
 */
void json_lexer_destroy(JSONLexer *lexer)
{
    QDECREF(lexer->token);
}