root / json-lexer.c @ 2c0d4b36
History | View | Annotate | Download (7.4 kB)
1 |
/*
|
---|---|
2 |
* JSON lexer
|
3 |
*
|
4 |
* Copyright IBM, Corp. 2009
|
5 |
*
|
6 |
* Authors:
|
7 |
* Anthony Liguori <aliguori@us.ibm.com>
|
8 |
*
|
9 |
* This work is licensed under the terms of the GNU LGPL, version 2.1 or later.
|
10 |
* See the COPYING.LIB file in the top-level directory.
|
11 |
*
|
12 |
*/
|
13 |
|
14 |
#include "qstring.h" |
15 |
#include "qlist.h" |
16 |
#include "qdict.h" |
17 |
#include "qint.h" |
18 |
#include "qemu-common.h" |
19 |
#include "json-lexer.h" |
20 |
|
21 |
/*
|
22 |
* \"([^\\\"]|(\\\"\\'\\\\\\/\\b\\f\\n\\r\\t\\u[0-9a-fA-F][0-9a-fA-F][0-9a-fA-F][0-9a-fA-F]))*\"
|
23 |
* '([^\\']|(\\\"\\'\\\\\\/\\b\\f\\n\\r\\t\\u[0-9a-fA-F][0-9a-fA-F][0-9a-fA-F][0-9a-fA-F]))*'
|
24 |
* 0|([1-9][0-9]*(.[0-9]+)?([eE]([-+])?[0-9]+))
|
25 |
* [{}\[\],:]
|
26 |
* [a-z]+
|
27 |
*
|
28 |
*/
|
29 |
|
30 |
/*
 * States of the lexer's state machine.  Each value is used as a row index
 * into the json_lexer[] transition table.
 *
 * ERROR must stay 0: table slots without an explicit initializer are
 * zero-filled, which makes every unlisted transition an error.
 */
enum json_lexer_state {
    ERROR = 0,

    /* a closing quote has been consumed */
    IN_DONE_STRING,

    /* inside a "..." string */
    IN_DQ_UCODE3,
    IN_DQ_UCODE2,
    IN_DQ_UCODE1,
    IN_DQ_UCODE0,
    IN_DQ_STRING_ESCAPE,
    IN_DQ_STRING,

    /* inside a '...' string */
    IN_SQ_UCODE3,
    IN_SQ_UCODE2,
    IN_SQ_UCODE1,
    IN_SQ_UCODE0,
    IN_SQ_STRING_ESCAPE,
    IN_SQ_STRING,

    /* numbers */
    IN_ZERO,
    IN_DIGITS,
    IN_DIGIT,
    IN_EXP_E,
    IN_MANTISSA,
    IN_MANTISSA_DIGITS,
    IN_NONZERO_NUMBER,
    IN_NEG_NONZERO_NUMBER,

    /* bare lowercase words */
    IN_KEYWORD,

    /* '%' format escapes */
    IN_ESCAPE,
    IN_ESCAPE_L,
    IN_ESCAPE_LL,
    IN_ESCAPE_I,
    IN_ESCAPE_I6,
    IN_ESCAPE_I64,
    IN_ESCAPE_DONE,

    IN_WHITESPACE,
    IN_OPERATOR_DONE,

    /* initial state; also used to dispatch the first byte of each token */
    IN_START,
};
65 |
|
66 |
/*
 * Row initializer: set every slot for bytes 0x00..0x7F to `token`.
 * Designated initializers that follow it in the same row override
 * individual slots.  Relies on the GNU range-designator extension.
 */
#define TERMINAL(token) [0x00 ... 0x7F] = (token)
67 |
|
68 |
static const uint8_t json_lexer[][256] = { |
69 |
[IN_DONE_STRING] = { |
70 |
TERMINAL(JSON_STRING), |
71 |
}, |
72 |
|
73 |
/* double quote string */
|
74 |
[IN_DQ_UCODE3] = { |
75 |
['0' ... '9'] = IN_DQ_STRING, |
76 |
['a' ... 'f'] = IN_DQ_STRING, |
77 |
['A' ... 'F'] = IN_DQ_STRING, |
78 |
}, |
79 |
[IN_DQ_UCODE2] = { |
80 |
['0' ... '9'] = IN_DQ_UCODE3, |
81 |
['a' ... 'f'] = IN_DQ_UCODE3, |
82 |
['A' ... 'F'] = IN_DQ_UCODE3, |
83 |
}, |
84 |
[IN_DQ_UCODE1] = { |
85 |
['0' ... '9'] = IN_DQ_UCODE2, |
86 |
['a' ... 'f'] = IN_DQ_UCODE2, |
87 |
['A' ... 'F'] = IN_DQ_UCODE2, |
88 |
}, |
89 |
[IN_DQ_UCODE0] = { |
90 |
['0' ... '9'] = IN_DQ_UCODE1, |
91 |
['a' ... 'f'] = IN_DQ_UCODE1, |
92 |
['A' ... 'F'] = IN_DQ_UCODE1, |
93 |
}, |
94 |
[IN_DQ_STRING_ESCAPE] = { |
95 |
['b'] = IN_DQ_STRING,
|
96 |
['f'] = IN_DQ_STRING,
|
97 |
['n'] = IN_DQ_STRING,
|
98 |
['r'] = IN_DQ_STRING,
|
99 |
['t'] = IN_DQ_STRING,
|
100 |
['\''] = IN_DQ_STRING,
|
101 |
['\"'] = IN_DQ_STRING,
|
102 |
['u'] = IN_DQ_UCODE0,
|
103 |
}, |
104 |
[IN_DQ_STRING] = { |
105 |
[1 ... 0xFF] = IN_DQ_STRING, |
106 |
['\\'] = IN_DQ_STRING_ESCAPE,
|
107 |
['"'] = IN_DONE_STRING,
|
108 |
}, |
109 |
|
110 |
/* single quote string */
|
111 |
[IN_SQ_UCODE3] = { |
112 |
['0' ... '9'] = IN_SQ_STRING, |
113 |
['a' ... 'f'] = IN_SQ_STRING, |
114 |
['A' ... 'F'] = IN_SQ_STRING, |
115 |
}, |
116 |
[IN_SQ_UCODE2] = { |
117 |
['0' ... '9'] = IN_SQ_UCODE3, |
118 |
['a' ... 'f'] = IN_SQ_UCODE3, |
119 |
['A' ... 'F'] = IN_SQ_UCODE3, |
120 |
}, |
121 |
[IN_SQ_UCODE1] = { |
122 |
['0' ... '9'] = IN_SQ_UCODE2, |
123 |
['a' ... 'f'] = IN_SQ_UCODE2, |
124 |
['A' ... 'F'] = IN_SQ_UCODE2, |
125 |
}, |
126 |
[IN_SQ_UCODE0] = { |
127 |
['0' ... '9'] = IN_SQ_UCODE1, |
128 |
['a' ... 'f'] = IN_SQ_UCODE1, |
129 |
['A' ... 'F'] = IN_SQ_UCODE1, |
130 |
}, |
131 |
[IN_SQ_STRING_ESCAPE] = { |
132 |
['b'] = IN_SQ_STRING,
|
133 |
['f'] = IN_SQ_STRING,
|
134 |
['n'] = IN_SQ_STRING,
|
135 |
['r'] = IN_SQ_STRING,
|
136 |
['t'] = IN_SQ_STRING,
|
137 |
['\''] = IN_SQ_STRING,
|
138 |
['\"'] = IN_SQ_STRING,
|
139 |
['u'] = IN_SQ_UCODE0,
|
140 |
}, |
141 |
[IN_SQ_STRING] = { |
142 |
[1 ... 0xFF] = IN_SQ_STRING, |
143 |
['\\'] = IN_SQ_STRING_ESCAPE,
|
144 |
['\''] = IN_DONE_STRING,
|
145 |
}, |
146 |
|
147 |
/* Zero */
|
148 |
[IN_ZERO] = { |
149 |
TERMINAL(JSON_INTEGER), |
150 |
['0' ... '9'] = ERROR, |
151 |
['.'] = IN_MANTISSA,
|
152 |
}, |
153 |
|
154 |
/* Float */
|
155 |
[IN_DIGITS] = { |
156 |
TERMINAL(JSON_FLOAT), |
157 |
['0' ... '9'] = IN_DIGITS, |
158 |
}, |
159 |
|
160 |
[IN_DIGIT] = { |
161 |
['0' ... '9'] = IN_DIGITS, |
162 |
}, |
163 |
|
164 |
[IN_EXP_E] = { |
165 |
['-'] = IN_DIGIT,
|
166 |
['+'] = IN_DIGIT,
|
167 |
['0' ... '9'] = IN_DIGITS, |
168 |
}, |
169 |
|
170 |
[IN_MANTISSA_DIGITS] = { |
171 |
TERMINAL(JSON_FLOAT), |
172 |
['0' ... '9'] = IN_MANTISSA_DIGITS, |
173 |
['e'] = IN_EXP_E,
|
174 |
['E'] = IN_EXP_E,
|
175 |
}, |
176 |
|
177 |
[IN_MANTISSA] = { |
178 |
['0' ... '9'] = IN_MANTISSA_DIGITS, |
179 |
}, |
180 |
|
181 |
/* Number */
|
182 |
[IN_NONZERO_NUMBER] = { |
183 |
TERMINAL(JSON_INTEGER), |
184 |
['0' ... '9'] = IN_NONZERO_NUMBER, |
185 |
['e'] = IN_EXP_E,
|
186 |
['E'] = IN_EXP_E,
|
187 |
['.'] = IN_MANTISSA,
|
188 |
}, |
189 |
|
190 |
[IN_NEG_NONZERO_NUMBER] = { |
191 |
['0'] = IN_ZERO,
|
192 |
['1' ... '9'] = IN_NONZERO_NUMBER, |
193 |
}, |
194 |
|
195 |
/* keywords */
|
196 |
[IN_KEYWORD] = { |
197 |
TERMINAL(JSON_KEYWORD), |
198 |
['a' ... 'z'] = IN_KEYWORD, |
199 |
}, |
200 |
|
201 |
/* whitespace */
|
202 |
[IN_WHITESPACE] = { |
203 |
TERMINAL(JSON_SKIP), |
204 |
[' '] = IN_WHITESPACE,
|
205 |
['\t'] = IN_WHITESPACE,
|
206 |
['\r'] = IN_WHITESPACE,
|
207 |
['\n'] = IN_WHITESPACE,
|
208 |
}, |
209 |
|
210 |
/* operator */
|
211 |
[IN_OPERATOR_DONE] = { |
212 |
TERMINAL(JSON_OPERATOR), |
213 |
}, |
214 |
|
215 |
/* escape */
|
216 |
[IN_ESCAPE_DONE] = { |
217 |
TERMINAL(JSON_ESCAPE), |
218 |
}, |
219 |
|
220 |
[IN_ESCAPE_LL] = { |
221 |
['d'] = IN_ESCAPE_DONE,
|
222 |
}, |
223 |
|
224 |
[IN_ESCAPE_L] = { |
225 |
['d'] = IN_ESCAPE_DONE,
|
226 |
['l'] = IN_ESCAPE_LL,
|
227 |
}, |
228 |
|
229 |
[IN_ESCAPE_I64] = { |
230 |
['d'] = IN_ESCAPE_DONE,
|
231 |
}, |
232 |
|
233 |
[IN_ESCAPE_I6] = { |
234 |
['4'] = IN_ESCAPE_I64,
|
235 |
}, |
236 |
|
237 |
[IN_ESCAPE_I] = { |
238 |
['6'] = IN_ESCAPE_I6,
|
239 |
}, |
240 |
|
241 |
[IN_ESCAPE] = { |
242 |
['d'] = IN_ESCAPE_DONE,
|
243 |
['i'] = IN_ESCAPE_DONE,
|
244 |
['p'] = IN_ESCAPE_DONE,
|
245 |
['s'] = IN_ESCAPE_DONE,
|
246 |
['f'] = IN_ESCAPE_DONE,
|
247 |
['l'] = IN_ESCAPE_L,
|
248 |
['I'] = IN_ESCAPE_I,
|
249 |
}, |
250 |
|
251 |
/* top level rule */
|
252 |
[IN_START] = { |
253 |
['"'] = IN_DQ_STRING,
|
254 |
['\''] = IN_SQ_STRING,
|
255 |
['0'] = IN_ZERO,
|
256 |
['1' ... '9'] = IN_NONZERO_NUMBER, |
257 |
['-'] = IN_NEG_NONZERO_NUMBER,
|
258 |
['{'] = IN_OPERATOR_DONE,
|
259 |
['}'] = IN_OPERATOR_DONE,
|
260 |
['['] = IN_OPERATOR_DONE,
|
261 |
[']'] = IN_OPERATOR_DONE,
|
262 |
[','] = IN_OPERATOR_DONE,
|
263 |
[':'] = IN_OPERATOR_DONE,
|
264 |
['a' ... 'z'] = IN_KEYWORD, |
265 |
['%'] = IN_ESCAPE,
|
266 |
[' '] = IN_WHITESPACE,
|
267 |
['\t'] = IN_WHITESPACE,
|
268 |
['\r'] = IN_WHITESPACE,
|
269 |
['\n'] = IN_WHITESPACE,
|
270 |
}, |
271 |
}; |
272 |
|
273 |
/*
 * Prepare @lexer for use.
 *
 * @lexer: lexer to initialise
 * @func:  callback invoked for every completed token
 *
 * The lexer starts in IN_START with an empty token buffer.  The position
 * counters are zeroed here because json_lexer_feed_char() increments them
 * and hands them to the emitter; they must not start out indeterminate.
 */
void json_lexer_init(JSONLexer *lexer, JSONLexerEmitter func)
{
    lexer->emit = func;
    lexer->state = IN_START;
    lexer->token = qstring_new();
    lexer->x = 0;
    lexer->y = 0;
}
279 |
|
280 |
/*
 * Advance the lexer by one input byte.
 *
 * @lexer: lexer state
 * @ch:    next input byte; NUL marks end of input (this is what
 *         json_lexer_flush() feeds)
 *
 * The byte is looked up in json_lexer[] for the current state.  If the
 * result is a token type, the byte ended the token collected so far: the
 * token is handed to the emitter (JSON_SKIP tokens are dropped silently),
 * the buffer is replaced by a fresh one, and the same byte is dispatched
 * again from IN_START because it is the first byte of the next token.
 *
 * Returns 0 on success, -EINVAL if the byte is not acceptable.  The error
 * is reported for the offending byte itself, including the case where the
 * byte legitimately ends one token but cannot start another.
 */
static int json_lexer_feed_char(JSONLexer *lexer, char ch)
{
    char buf[2];

    /* Position bookkeeping; x restarts on every newline. */
    lexer->x++;
    if (ch == '\n') {
        lexer->x = 0;
        lexer->y++;
    }

    lexer->state = json_lexer[lexer->state][(uint8_t)ch];

    switch (lexer->state) {
    case JSON_OPERATOR:
    case JSON_ESCAPE:
    case JSON_INTEGER:
    case JSON_FLOAT:
    case JSON_KEYWORD:
    case JSON_STRING:
        lexer->emit(lexer, lexer->token, lexer->state, lexer->x, lexer->y);
        /* fall through */
    case JSON_SKIP:
        QDECREF(lexer->token);
        lexer->token = qstring_new();

        if (ch == '\0') {
            /*
             * End of input: nothing starts a new token.  Return to the
             * initial state so the lexer stays usable instead of getting
             * stuck in ERROR.
             */
            lexer->state = IN_START;
            return 0;
        }

        /* The terminating byte is also the first byte of the next token. */
        lexer->state = json_lexer[IN_START][(uint8_t)ch];
        if (lexer->state == ERROR) {
            /* Report the bad byte now rather than on the following one. */
            return -EINVAL;
        }
        break;
    case ERROR:
        return -EINVAL;
    default:
        break;
    }

    /* qstring_append() takes a C string, so wrap the byte in one. */
    buf[0] = ch;
    buf[1] = 0;

    qstring_append(lexer->token, buf);

    return 0;
}
318 |
|
319 |
/*
 * Feed @size bytes starting at @buffer into the lexer, one at a time.
 *
 * Stops at the first byte json_lexer_feed_char() rejects and returns its
 * negative error code; returns 0 once every byte has been consumed.
 */
int json_lexer_feed(JSONLexer *lexer, const char *buffer, size_t size)
{
    const char *cursor = buffer;
    size_t remaining = size;

    while (remaining > 0) {
        int rc = json_lexer_feed_char(lexer, *cursor);

        if (rc < 0) {
            return rc;
        }

        cursor++;
        remaining--;
    }

    return 0;
}
334 |
|
335 |
/*
 * Signal end of input so that a token still being collected is completed.
 *
 * This is done by feeding a NUL byte to json_lexer_feed_char().  When the
 * lexer is still in IN_START there is nothing to complete, and feeding NUL
 * there would hit an ERROR slot of the table, so that case returns success
 * directly.
 *
 * Returns 0 on success, or the negative error code from
 * json_lexer_feed_char() when the pending input is not a complete token.
 */
int json_lexer_flush(JSONLexer *lexer)
{
    if (lexer->state == IN_START) {
        return 0;
    }

    return json_lexer_feed_char(lexer, 0);
}
339 |
|
340 |
/*
 * Release the resources held by @lexer.
 *
 * Drops the lexer's reference to its token buffer and clears the pointer
 * so no stale reference is left behind in the structure.
 */
void json_lexer_destroy(JSONLexer *lexer)
{
    QDECREF(lexer->token);
    lexer->token = NULL;
}