[v1] json: Fixes, error reporting improvements, cleanups

[Qemu-devel] [PATCH 25/56] json: Leave rejecting invalid escape sequences to parser

Posted by Markus Armbruster 7 years, 6 months ago

Both lexer and parser reject invalid escape sequences in strings.
Since the lexer rejects them first, the parser never sees them, which
makes the parser's check useless.

The lexer ends the token right after the first non-well-formed byte.
This tends to lead to suboptimal error reporting.  For instance, input

    {"abc\@ijk": 1}

produces the tokens

    JSON_LCURLY   {
    JSON_ERROR    "abc\@
    JSON_KEYWORD  ijk
    JSON_ERROR    ": 1}\n

The parser then reports three errors

    Invalid JSON syntax
    JSON parse error, invalid keyword 'ijk'
    Invalid JSON syntax

before it recovers at the newline.

Drop the lexer's escape sequence checking, and make it accept the same
characters after '\' it accepts elsewhere in strings.  It now produces

    JSON_LCURLY   {
    JSON_STRING   "abc\@ijk"
    JSON_COLON    :
    JSON_INTEGER  1
    JSON_RCURLY   }

and the parser reports just

    JSON parse error, invalid escape sequence in string

While there, fix parse_string()'s inaccurate function comment.

Signed-off-by: Markus Armbruster <armbru@redhat.com>
---
 qobject/json-lexer.c  | 72 +++----------------------------------------
 qobject/json-parser.c | 56 +++++++++++++++++++--------------
 2 files changed, 37 insertions(+), 91 deletions(-)

diff --git a/qobject/json-lexer.c b/qobject/json-lexer.c
index 36fb665b12..af0a7fdb8a 100644
--- a/qobject/json-lexer.c
+++ b/qobject/json-lexer.c
@@ -80,6 +80,8 @@
  *    escape = %x5C              ; \
  *    quotation-mark = %x22      ; "
  *    unescaped = %x20-21 / %x23-5B / %x5D-10FFFF
+ *    [This lexer accepts any non-control character after escape, and
+ *    leaves rejecting invalid ones to the parser.]
  *
  *
  * Extensions over RFC 7159:
@@ -99,16 +101,8 @@
 
 enum json_lexer_state {
     IN_ERROR = 0,               /* must really be 0, see json_lexer[] */
-    IN_DQ_UCODE3,
-    IN_DQ_UCODE2,
-    IN_DQ_UCODE1,
-    IN_DQ_UCODE0,
     IN_DQ_STRING_ESCAPE,
     IN_DQ_STRING,
-    IN_SQ_UCODE3,
-    IN_SQ_UCODE2,
-    IN_SQ_UCODE1,
-    IN_SQ_UCODE0,
     IN_SQ_STRING_ESCAPE,
     IN_SQ_STRING,
     IN_ZERO,
@@ -144,37 +138,8 @@ static const uint8_t json_lexer[][256] =  {
     /* Relies on default initialization to IN_ERROR! */
 
     /* double quote string */
-    [IN_DQ_UCODE3] = {
-        ['0' ... '9'] = IN_DQ_STRING,
-        ['a' ... 'f'] = IN_DQ_STRING,
-        ['A' ... 'F'] = IN_DQ_STRING,
-    },
-    [IN_DQ_UCODE2] = {
-        ['0' ... '9'] = IN_DQ_UCODE3,
-        ['a' ... 'f'] = IN_DQ_UCODE3,
-        ['A' ... 'F'] = IN_DQ_UCODE3,
-    },
-    [IN_DQ_UCODE1] = {
-        ['0' ... '9'] = IN_DQ_UCODE2,
-        ['a' ... 'f'] = IN_DQ_UCODE2,
-        ['A' ... 'F'] = IN_DQ_UCODE2,
-    },
-    [IN_DQ_UCODE0] = {
-        ['0' ... '9'] = IN_DQ_UCODE1,
-        ['a' ... 'f'] = IN_DQ_UCODE1,
-        ['A' ... 'F'] = IN_DQ_UCODE1,
-    },
     [IN_DQ_STRING_ESCAPE] = {
-        ['b'] = IN_DQ_STRING,
-        ['f'] =  IN_DQ_STRING,
-        ['n'] =  IN_DQ_STRING,
-        ['r'] =  IN_DQ_STRING,
-        ['t'] =  IN_DQ_STRING,
-        ['/'] = IN_DQ_STRING,
-        ['\\'] = IN_DQ_STRING,
-        ['\''] = IN_DQ_STRING,
-        ['\"'] = IN_DQ_STRING,
-        ['u'] = IN_DQ_UCODE0,
+        [0x20 ... 0xFD] = IN_DQ_STRING,
     },
     [IN_DQ_STRING] = {
         [0x20 ... 0xFD] = IN_DQ_STRING,
@@ -183,37 +148,8 @@ static const uint8_t json_lexer[][256] =  {
     },
 
     /* single quote string */
-    [IN_SQ_UCODE3] = {
-        ['0' ... '9'] = IN_SQ_STRING,
-        ['a' ... 'f'] = IN_SQ_STRING,
-        ['A' ... 'F'] = IN_SQ_STRING,
-    },
-    [IN_SQ_UCODE2] = {
-        ['0' ... '9'] = IN_SQ_UCODE3,
-        ['a' ... 'f'] = IN_SQ_UCODE3,
-        ['A' ... 'F'] = IN_SQ_UCODE3,
-    },
-    [IN_SQ_UCODE1] = {
-        ['0' ... '9'] = IN_SQ_UCODE2,
-        ['a' ... 'f'] = IN_SQ_UCODE2,
-        ['A' ... 'F'] = IN_SQ_UCODE2,
-    },
-    [IN_SQ_UCODE0] = {
-        ['0' ... '9'] = IN_SQ_UCODE1,
-        ['a' ... 'f'] = IN_SQ_UCODE1,
-        ['A' ... 'F'] = IN_SQ_UCODE1,
-    },
     [IN_SQ_STRING_ESCAPE] = {
-        ['b'] = IN_SQ_STRING,
-        ['f'] =  IN_SQ_STRING,
-        ['n'] =  IN_SQ_STRING,
-        ['r'] =  IN_SQ_STRING,
-        ['t'] =  IN_SQ_STRING,
-        ['/'] = IN_SQ_STRING,
-        ['\\'] = IN_SQ_STRING,
-        ['\''] = IN_SQ_STRING,
-        ['\"'] = IN_SQ_STRING,
-        ['u'] = IN_SQ_UCODE0,
+        [0x20 ... 0xFD] = IN_SQ_STRING,
     },
     [IN_SQ_STRING] = {
         [0x20 ... 0xFD] = IN_SQ_STRING,
diff --git a/qobject/json-parser.c b/qobject/json-parser.c
index 14225c3c09..d469004616 100644
--- a/qobject/json-parser.c
+++ b/qobject/json-parser.c
@@ -106,30 +106,40 @@ static int hex2decimal(char ch)
 }
 
 /**
- * parse_string(): Parse a json string and return a QObject
+ * parse_string(): Parse a JSON string
  *
- *  string
- *      ""
- *      " chars "
- *  chars
- *      char
- *      char chars
- *  char
- *      any-Unicode-character-
- *          except-"-or-\-or-
- *          control-character
- *      \"
- *      \\
- *      \/
- *      \b
- *      \f
- *      \n
- *      \r
- *      \t
- *      \u four-hex-digits 
+ * From RFC 7159 "The JavaScript Object Notation (JSON) Data
+ * Interchange Format":
+ *
+ *    char = unescaped /
+ *        escape (
+ *            %x22 /          ; "    quotation mark  U+0022
+ *            %x5C /          ; \    reverse solidus U+005C
+ *            %x2F /          ; /    solidus         U+002F
+ *            %x62 /          ; b    backspace       U+0008
+ *            %x66 /          ; f    form feed       U+000C
+ *            %x6E /          ; n    line feed       U+000A
+ *            %x72 /          ; r    carriage return U+000D
+ *            %x74 /          ; t    tab             U+0009
+ *            %x75 4HEXDIG )  ; uXXXX                U+XXXX
+ *    escape = %x5C              ; \
+ *    quotation-mark = %x22      ; "
+ *    unescaped = %x20-21 / %x23-5B / %x5D-10FFFF
+ *
+ * Extensions over RFC 7159:
+ * - Extra escape sequence in strings:
+ *   0x27 (apostrophe) is recognized after escape, too
+ * - Single-quoted strings:
+ *   Like double-quoted strings, except they're delimited by %x27
+ *   (apostrophe) instead of %x22 (quotation mark), and can't contain
+ *   unescaped apostrophe, but can contain unescaped quotation mark.
+ *
+ * Note:
+ * - Encoding is modified UTF-8.
+ * - Invalid Unicode characters are rejected.
+ * - Control characters are rejected by the lexer.
  */
-static QString *qstring_from_escaped_str(JSONParserContext *ctxt,
-                                         JSONToken *token)
+static QString *parse_string(JSONParserContext *ctxt, JSONToken *token)
 {
     const char *ptr = token->str;
     QString *str;
@@ -494,7 +504,7 @@ static QObject *parse_literal(JSONParserContext *ctxt)
 
     switch (token->type) {
     case JSON_STRING:
-        return QOBJECT(qstring_from_escaped_str(ctxt, token));
+        return QOBJECT(parse_string(ctxt, token));
     case JSON_INTEGER: {
         /*
          * Represent JSON_INTEGER as QNUM_I64 if possible, else as
-- 
2.17.1

Re: [Qemu-devel] [PATCH 25/56] json: Leave rejecting invalid escape sequences to parser

Posted by Eric Blake 7 years, 6 months ago

On 08/08/2018 07:03 AM, Markus Armbruster wrote:
> Both lexer and parser reject invalid escape sequences in strings.  The
> parser's check is useless.
> 

> 
> Drop the lexer's escape sequence checking, and make it accept the same
> characters after '\' it accepts elsewhere in strings.  It now produces
> 
>      JSON_LCURLY   {
>      JSON_STRING   "abc\@ijk"
>      JSON_COLON    :
>      JSON_INTEGER  1
>      JSON_RCURLY
> 
> and the parser reports just
> 
>      JSON parse error, invalid escape sequence in string
> 
> While there, fix parse_string()'s inaccurate function comment.

Worthwhile improvement.

> 
> Signed-off-by: Markus Armbruster <armbru@redhat.com>
> ---
>   qobject/json-lexer.c  | 72 +++----------------------------------------
>   qobject/json-parser.c | 56 +++++++++++++++++++--------------
>   2 files changed, 37 insertions(+), 91 deletions(-)

and shorter!

>       [IN_DQ_STRING_ESCAPE] = {
> -        ['b'] = IN_DQ_STRING,
> -        ['f'] =  IN_DQ_STRING,
> -        ['n'] =  IN_DQ_STRING,
> -        ['r'] =  IN_DQ_STRING,
> -        ['t'] =  IN_DQ_STRING,
> -        ['/'] = IN_DQ_STRING,
> -        ['\\'] = IN_DQ_STRING,
> -        ['\''] = IN_DQ_STRING,
> -        ['\"'] = IN_DQ_STRING,
> -        ['u'] = IN_DQ_UCODE0,
> +        [0x20 ... 0xFD] = IN_DQ_STRING,

Among other things, this means the parser now has to flag "\u" as an 
incomplete escape - but your added testsuite coverage earlier in the 
series ensures that we do.

> +++ b/qobject/json-parser.c
> @@ -106,30 +106,40 @@ static int hex2decimal(char ch)
>   }
>   
>   /**
> - * parse_string(): Parse a json string and return a QObject
> + * parse_string(): Parse a JSON string
>    *
> - *  string

> + * From RFC 7159 "The JavaScript Object Notation (JSON) Data
> + * Interchange Format":
> + *
> + *    char = unescaped /
> + *        escape (
> + *            %x22 /          ; "    quotation mark  U+0022
> + *            %x5C /          ; \    reverse solidus U+005C
> + *            %x2F /          ; /    solidus         U+002F
> + *            %x62 /          ; b    backspace       U+0008
> + *            %x66 /          ; f    form feed       U+000C
> + *            %x6E /          ; n    line feed       U+000A
> + *            %x72 /          ; r    carriage return U+000D
> + *            %x74 /          ; t    tab             U+0009
> + *            %x75 4HEXDIG )  ; uXXXX                U+XXXX
> + *    escape = %x5C              ; \
> + *    quotation-mark = %x22      ; "
> + *    unescaped = %x20-21 / %x23-5B / %x5D-10FFFF
> + *
> + * Extensions over RFC 7159:
> + * - Extra escape sequence in strings:
> + *   0x27 (apostrophe) is recognized after escape, too
> + * - Single-quoted strings:
> + *   Like double-quoted strings, except they're delimited by %x27
> + *   (apostrophe) instead of %x22 (quotation mark), and can't contain
> + *   unescaped apostrophe, but can contain unescaped quotation mark.
> + *
> + * Note:
> + * - Encoding is modified UTF-8.

That is an extension over RFC 7159. But I'm okay with leaving it in the 
Notes section.

> + * - Invalid Unicode characters are rejected.
> + * - Control characters are rejected by the lexer.

Worth being explicit that this is 00-1f, fe, and ff?

-- 
Eric Blake, Principal Software Engineer
Red Hat, Inc.           +1-919-301-3266
Virtualization:  qemu.org | libvirt.org

Re: [Qemu-devel] [PATCH 25/56] json: Leave rejecting invalid escape sequences to parser

Posted by Markus Armbruster 7 years, 6 months ago

Eric Blake <eblake@redhat.com> writes:

> On 08/08/2018 07:03 AM, Markus Armbruster wrote:
>> Both lexer and parser reject invalid escape sequences in strings.  The
>> parser's check is useless.
>>
>
>>
>> Drop the lexer's escape sequence checking, and make it accept the same
>> characters after '\' it accepts elsewhere in strings.  It now produces
>>
>>      JSON_LCURLY   {
>>      JSON_STRING   "abc\@ijk"
>>      JSON_COLON    :
>>      JSON_INTEGER  1
>>      JSON_RCURLY
>>
>> and the parser reports just
>>
>>      JSON parse error, invalid escape sequence in string
>>
>> While there, fix parse_string()'s inaccurate function comment.
>
> Worthwhile improvement.
>
>>
>> Signed-off-by: Markus Armbruster <armbru@redhat.com>
>> ---
>>   qobject/json-lexer.c  | 72 +++----------------------------------------
>>   qobject/json-parser.c | 56 +++++++++++++++++++--------------
>>   2 files changed, 37 insertions(+), 91 deletions(-)
>
> and shorter!
>
>>       [IN_DQ_STRING_ESCAPE] = {
>> -        ['b'] = IN_DQ_STRING,
>> -        ['f'] =  IN_DQ_STRING,
>> -        ['n'] =  IN_DQ_STRING,
>> -        ['r'] =  IN_DQ_STRING,
>> -        ['t'] =  IN_DQ_STRING,
>> -        ['/'] = IN_DQ_STRING,
>> -        ['\\'] = IN_DQ_STRING,
>> -        ['\''] = IN_DQ_STRING,
>> -        ['\"'] = IN_DQ_STRING,
>> -        ['u'] = IN_DQ_UCODE0,
>> +        [0x20 ... 0xFD] = IN_DQ_STRING,
>
> Among other things, this means the parser now has to flag "\u" as an
> incomplete escape - but your added testsuite coverage earlier in the
> series ensures that we do.

Yes.

>> +++ b/qobject/json-parser.c
>> @@ -106,30 +106,40 @@ static int hex2decimal(char ch)
>>   }
>>     /**
>> - * parse_string(): Parse a json string and return a QObject
>> + * parse_string(): Parse a JSON string
>>    *
>> - *  string
>
>> + * From RFC 7159 "The JavaScript Object Notation (JSON) Data
>> + * Interchange Format":
>> + *
>> + *    char = unescaped /
>> + *        escape (
>> + *            %x22 /          ; "    quotation mark  U+0022
>> + *            %x5C /          ; \    reverse solidus U+005C
>> + *            %x2F /          ; /    solidus         U+002F
>> + *            %x62 /          ; b    backspace       U+0008
>> + *            %x66 /          ; f    form feed       U+000C
>> + *            %x6E /          ; n    line feed       U+000A
>> + *            %x72 /          ; r    carriage return U+000D
>> + *            %x74 /          ; t    tab             U+0009
>> + *            %x75 4HEXDIG )  ; uXXXX                U+XXXX
>> + *    escape = %x5C              ; \
>> + *    quotation-mark = %x22      ; "
>> + *    unescaped = %x20-21 / %x23-5B / %x5D-10FFFF
>> + *
>> + * Extensions over RFC 7159:
>> + * - Extra escape sequence in strings:
>> + *   0x27 (apostrophe) is recognized after escape, too
>> + * - Single-quoted strings:
>> + *   Like double-quoted strings, except they're delimited by %x27
>> + *   (apostrophe) instead of %x22 (quotation mark), and can't contain
>> + *   unescaped apostrophe, but can contain unescaped quotation mark.
>> + *
>> + * Note:
>> + * - Encoding is modified UTF-8.
>
> That is an extension over RFC 7159. But I'm okay with leaving it in
> the Notes section.
>
>> + * - Invalid Unicode characters are rejected.
>> + * - Control characters are rejected by the lexer.
>
> Worth being explicit that this is 00-1f, fe, and ff?

\xFE and \xFF are invalid, not control.

What about:

 * - Invalid Unicode characters are rejected.
 * - Control characters \x00..\x1F are rejected by the lexer.

Re: [Qemu-devel] [PATCH 25/56] json: Leave rejecting invalid escape sequences to parser

Posted by Eric Blake 7 years, 5 months ago

On 08/13/2018 02:05 AM, Markus Armbruster wrote:
> Eric Blake <eblake@redhat.com> writes:
> 
>> On 08/08/2018 07:03 AM, Markus Armbruster wrote:
>>> Both lexer and parser reject invalid escape sequences in strings.  The
>>> parser's check is useless.
>>>

>>> + * Extensions over RFC 7159:
>>> + * - Extra escape sequence in strings:
>>> + *   0x27 (apostrophe) is recognized after escape, too
>>> + * - Single-quoted strings:
>>> + *   Like double-quoted strings, except they're delimited by %x27
>>> + *   (apostrophe) instead of %x22 (quotation mark), and can't contain
>>> + *   unescaped apostrophe, but can contain unescaped quotation mark.
>>> + *
>>> + * Note:
>>> + * - Encoding is modified UTF-8.
>>
>> That is an extension over RFC 7159. But I'm okay with leaving it in
>> the Notes section.
>>
>>> + * - Invalid Unicode characters are rejected.
>>> + * - Control characters are rejected by the lexer.
>>
>> Worth being explicit that this is 00-1f, fe, and ff?
> 
> \xFE and \xFF are invalid, not control.
> 
> What about:
> 
>   * - Invalid Unicode characters are rejected.
>   * - Control characters \x00..\x1F are rejected by the lexer.

Works for me.

-- 
Eric Blake, Principal Software Engineer
Red Hat, Inc.           +1-919-301-3266
Virtualization:  qemu.org | libvirt.org