Handling C code purely using regular expressions doesn't work well.
Add a C tokenizer to help doing it the right way.
The tokenizer was written using as basis the Python re documentation
tokenizer example from:
https://docs.python.org/3/library/re.html#writing-a-tokenizer
Signed-off-by: Mauro Carvalho Chehab <mchehab+huawei@kernel.org>
Message-ID: <c63ad36c81fe043e9e33ca55630414893f127413.1773074166.git.mchehab+huawei@kernel.org>
---
tools/lib/python/kdoc/kdoc_re.py | 234 +++++++++++++++++++++++++++++++
1 file changed, 234 insertions(+)
diff --git a/tools/lib/python/kdoc/kdoc_re.py b/tools/lib/python/kdoc/kdoc_re.py
index 085b89a4547c..7bed4e9a8810 100644
--- a/tools/lib/python/kdoc/kdoc_re.py
+++ b/tools/lib/python/kdoc/kdoc_re.py
@@ -141,6 +141,240 @@ class KernRe:
return self.last_match.groups()
+class TokType():
+
+ @staticmethod
+ def __str__(val):
+ """Return the name of an enum value"""
+ return TokType._name_by_val.get(val, f"UNKNOWN({val})")
+
+class CToken():
+ """
+ Data class to define a C token.
+ """
+
+ # Tokens that can be used by the parser. Works like a C enum.
+
+ COMMENT = 0 #: A standard C or C99 comment, including delimiter.
+ STRING = 1 #: A string, including quotation marks.
+ CHAR = 2 #: A character, including apostrophes.
+ NUMBER = 3 #: A number.
+ PUNC = 4 #: A punctuation mark: ``;`` / ``,`` / ``.``.
+ BEGIN = 5 #: A begin character: ``{`` / ``[`` / ``(``.
+ END = 6 #: A end character: ``}`` / ``]`` / ``)``.
+ CPP = 7 #: A preprocessor macro.
+ HASH = 8 #: The hash character - useful to handle other macros.
+ OP = 9 #: A C operator (add, subtract, ...).
+ STRUCT = 10 #: A ``struct`` keyword.
+ UNION = 11 #: A ``union`` keyword.
+ ENUM = 12 #: An ``enum`` keyword.
+ TYPEDEF = 13 #: A ``typedef`` keyword.
+ NAME = 14 #: A name. Can be an ID or a type.
+ SPACE = 15 #: Any space characters, including new lines
+
+ MISMATCH = 255 #: an error indicator: should never happen in practice.
+
+ # Dict to convert from an enum integer into a string.
+ _name_by_val = {v: k for k, v in dict(vars()).items() if isinstance(v, int)}
+
+ # Dict to convert from string to an enum-like integer value.
+ _name_to_val = {k: v for v, k in _name_by_val.items()}
+
+ @staticmethod
+ def to_name(val):
+ """Convert from an integer value from CToken enum into a string"""
+
+ return CToken._name_by_val.get(val, f"UNKNOWN({val})")
+
+ @staticmethod
+ def from_name(name):
+ """Convert a string into a CToken enum value"""
+ if name in CToken._name_to_val:
+ return CToken._name_to_val[name]
+
+ return CToken.MISMATCH
+
+ def __init__(self, kind, value, pos,
+ brace_level, paren_level, bracket_level):
+ self.kind = kind
+ self.value = value
+ self.pos = pos
+ self.brace_level = brace_level
+ self.paren_level = paren_level
+ self.bracket_level = bracket_level
+
+ def __repr__(self):
+ name = self.to_name(self.kind)
+ if isinstance(self.value, str):
+ value = '"' + self.value + '"'
+ else:
+ value = self.value
+
+ return f"CToken({name}, {value}, {self.pos}, " \
+ f"{self.brace_level}, {self.paren_level}, {self.bracket_level})"
+
+#: Tokens to parse C code.
+TOKEN_LIST = [
+ (CToken.COMMENT, r"//[^\n]*|/\*[\s\S]*?\*/"),
+
+ (CToken.STRING, r'"(?:\\.|[^"\\])*"'),
+ (CToken.CHAR, r"'(?:\\.|[^'\\])'"),
+
+ (CToken.NUMBER, r"0[xX][0-9a-fA-F]+[uUlL]*|0[0-7]+[uUlL]*|"
+ r"[0-9]+(\.[0-9]*)?([eE][+-]?[0-9]+)?[fFlL]*"),
+
+ (CToken.PUNC, r"[;,\.]"),
+
+ (CToken.BEGIN, r"[\[\(\{]"),
+
+ (CToken.END, r"[\]\)\}]"),
+
+ (CToken.CPP, r"#\s*(define|include|ifdef|ifndef|if|else|elif|endif|undef|pragma)\b"),
+
+ (CToken.HASH, r"#"),
+
+ (CToken.OP, r"\+\+|\-\-|\->|==|\!=|<=|>=|&&|\|\||<<|>>|\+=|\-=|\*=|/=|%="
+ r"|&=|\|=|\^=|=|\+|\-|\*|/|%|<|>|&|\||\^|~|!|\?|\:"),
+
+ (CToken.STRUCT, r"\bstruct\b"),
+ (CToken.UNION, r"\bunion\b"),
+ (CToken.ENUM, r"\benum\b"),
+ (CToken.TYPEDEF, r"\bkinddef\b"),
+
+ (CToken.NAME, r"[A-Za-z_][A-Za-z0-9_]*"),
+
+ (CToken.SPACE, r"[\s]+"),
+
+ (CToken.MISMATCH,r"."),
+]
+
+#: Handle C continuation lines.
+RE_CONT = KernRe(r"\\\n")
+
+RE_COMMENT_START = KernRe(r'/\*\s*')
+
+#: tokenizer regex. Will be filled at the first CTokenizer usage.
+re_scanner = None
+
+class CTokenizer():
+ """
+ Scan C statements and definitions and produce tokens.
+
+ When converted to string, it drops comments and handle public/private
+ values, respecting depth.
+ """
+
+ # This class is inspired and follows the basic concepts of:
+ # https://docs.python.org/3/library/re.html#writing-a-tokenizer
+
+ def _tokenize(self, source):
+ """
+ Iterator that parses ``source``, splitting it into tokens, as defined
+ at ``TOKEN_LIST``.
+
+ The iterator returns a CToken class object.
+ """
+
+ # Handle continuation lines. Note that kdoc_parser already has a
+ # logic to do that. Still, let's keep it for completeness, as we might
+ # end re-using this tokenizer outside kernel-doc some day - or we may
+ # eventually remove from there as a future cleanup.
+ source = RE_CONT.sub("", source)
+
+ brace_level = 0
+ paren_level = 0
+ bracket_level = 0
+
+ for match in re_scanner.finditer(source):
+ kind = CToken.from_name(match.lastgroup)
+ pos = match.start()
+ value = match.group()
+
+ if kind == CToken.MISMATCH:
+ raise RuntimeError(f"Unexpected token '{value}' on {pos}:\n\t{source}")
+ elif kind == CToken.BEGIN:
+ if value == '(':
+ paren_level += 1
+ elif value == '[':
+ bracket_level += 1
+ else: # value == '{'
+ brace_level += 1
+
+ elif kind == CToken.END:
+ if value == ')' and paren_level > 0:
+ paren_level -= 1
+ elif value == ']' and bracket_level > 0:
+ bracket_level -= 1
+ elif brace_level > 0: # value == '}'
+ brace_level -= 1
+
+ yield CToken(kind, value, pos,
+ brace_level, paren_level, bracket_level)
+
+ def __init__(self, source):
+ """
+ Create a regular expression to handle TOKEN_LIST.
+
+ While I generally don't like using regex group naming via:
+ (?P<name>...)
+
+ in this particular case, it makes sense, as we can pick the name
+ when matching a code via re_scanner().
+ """
+ global re_scanner
+
+ if not re_scanner:
+ re_tokens = []
+
+ for kind, pattern in TOKEN_LIST:
+ name = CToken.to_name(kind)
+ re_tokens.append(f"(?P<{name}>{pattern})")
+
+ re_scanner = KernRe("|".join(re_tokens), re.MULTILINE | re.DOTALL)
+
+ self.tokens = []
+ for tok in self._tokenize(source):
+ self.tokens.append(tok)
+
+ def __str__(self):
+ out=""
+ show_stack = [True]
+
+ for tok in self.tokens:
+ if tok.kind == CToken.BEGIN:
+ show_stack.append(show_stack[-1])
+
+ elif tok.kind == CToken.END:
+ prev = show_stack[-1]
+ if len(show_stack) > 1:
+ show_stack.pop()
+
+ if not prev and show_stack[-1]:
+ #
+ # Try to preserve indent
+ #
+ out += "\t" * (len(show_stack) - 1)
+
+ out += str(tok.value)
+ continue
+
+ elif tok.kind == CToken.COMMENT:
+ comment = RE_COMMENT_START.sub("", tok.value)
+
+ if comment.startswith("private:"):
+ show_stack[-1] = False
+ show = False
+ elif comment.startswith("public:"):
+ show_stack[-1] = True
+
+ continue
+
+ if show_stack[-1]:
+ out += str(tok.value)
+
+ return out
+
+
#: Nested delimited pairs (brackets and parenthesis)
DELIMITER_PAIRS = {
'{': '}',
--
2.52.0
On Thu, 12 Mar 2026 15:54:25 +0100, Mauro Carvalho Chehab <mchehab+huawei@kernel.org> wrote:
> Handling C code purely using regular expressions doesn't work well.
>
> Add a C tokenizer to help doing it the right way.
>
> The tokenizer was written using as basis the Python re documentation
> tokenizer example from:
> https://docs.python.org/3/library/re.html#writing-a-tokenizer
>
> Signed-off-by: Mauro Carvalho Chehab <mchehab+huawei@kernel.org>
> Message-ID: <c63ad36c81fe043e9e33ca55630414893f127413.1773074166.git.mchehab+huawei@kernel.org>
> Message-ID: <8541ffa469647db1a7154f274fb2d55b4c127dcb.1773326442.git.mchehab+huawei@kernel.org>
This is a combined effort to review this patch and to try out "b4 review",
we'll see how it goes :).
> diff --git a/tools/lib/python/kdoc/kdoc_re.py b/tools/lib/python/kdoc/kdoc_re.py
> index 085b89a4547c0..7bed4e9a88108 100644
> --- a/tools/lib/python/kdoc/kdoc_re.py
> +++ b/tools/lib/python/kdoc/kdoc_re.py
> @@ -141,6 +141,240 @@ class KernRe:
> [ ... skip 4 lines ... ]
> +
> + @staticmethod
> + def __str__(val):
> + """Return the name of an enum value"""
> + return TokType._name_by_val.get(val, f"UNKNOWN({val})")
> +
What is this class supposed to do?
> [ ... skip 27 lines ... ]
> + _name_by_val = {v: k for k, v in dict(vars()).items() if isinstance(v, int)}
> +
> + # Dict to convert from string to an enum-like integer value.
> + _name_to_val = {k: v for v, k in _name_by_val.items()}
> +
> + @staticmethod
This stuff strikes me as a bit overdone; _name_to_val is really just the
variable list for the class, right?
> [ ... skip 30 lines ... ]
> + f"{self.brace_level}, {self.paren_level}, {self.bracket_level})"
> +
> +#: Tokens to parse C code.
> +TOKEN_LIST = [
> + (CToken.COMMENT, r"//[^\n]*|/\*[\s\S]*?\*/"),
> +
So these aren't "tokens", this is a list of regexes; how is it intended
to be used?
> + (CToken.STRING, r'"(?:\\.|[^"\\])*"'),
> + (CToken.CHAR, r"'(?:\\.|[^'\\])'"),
> +
> + (CToken.NUMBER, r"0[xX][0-9a-fA-F]+[uUlL]*|0[0-7]+[uUlL]*|"
How does "[\s\S]*" differ from plain old "*" ?
> [ ... skip 15 lines ... ]
> + (CToken.STRUCT, r"\bstruct\b"),
> + (CToken.UNION, r"\bunion\b"),
> + (CToken.ENUM, r"\benum\b"),
> + (CToken.TYPEDEF, r"\bkinddef\b"),
> +
> + (CToken.NAME, r"[A-Za-z_][A-Za-z0-9_]*"),
"-" and "!" never need to be escaped.
> +
> + (CToken.SPACE, r"[\s]+"),
> +
> + (CToken.MISMATCH,r"."),
> +]
> +
"kinddef" ?
> +#: Handle C continuation lines.
> +RE_CONT = KernRe(r"\\\n")
> +
> +RE_COMMENT_START = KernRe(r'/\*\s*')
> +
Don't need the [brackets] here
> [ ... skip 6 lines ... ]
> +
> + When converted to string, it drops comments and handle public/private
> + values, respecting depth.
> + """
> +
> + # This class is inspired and follows the basic concepts of:
That seems weird, why don't you just initialize it here?
> [ ... skip 14 lines ... ]
> + source = RE_CONT.sub("", source)
> +
> + brace_level = 0
> + paren_level = 0
> + bracket_level = 0
> +
Do you mean "iterator" here?
> [ ... skip 33 lines ... ]
> + in this particular case, it makes sense, as we can pick the name
> + when matching a code via re_scanner().
> + """
> + global re_scanner
> +
> + if not re_scanner:
Putting __init__() first is fairly standard, methinks.
> [ ... skip 15 lines ... ]
> +
> + for tok in self.tokens:
> + if tok.kind == CToken.BEGIN:
> + show_stack.append(show_stack[-1])
> +
> + elif tok.kind == CToken.END:
I still don't understand why you do this here - this is all constant, right?
> + prev = show_stack[-1]
> + if len(show_stack) > 1:
> + show_stack.pop()
> +
> + if not prev and show_stack[-1]:
So you create a nice iterator structure, then just put it all together into a
list anyway?
--
Jonathan Corbet <corbet@lwn.net>
On Mon, 16 Mar 2026 17:01:11 -0600
Jonathan Corbet <corbet@lwn.net> wrote:
> On Thu, 12 Mar 2026 15:54:25 +0100, Mauro Carvalho Chehab <mchehab+huawei@kernel.org> wrote:
> > Handling C code purely using regular expressions doesn't work well.
> >
> > Add a C tokenizer to help doing it the right way.
> >
> > The tokenizer was written using as basis the Python re documentation
> > tokenizer example from:
> > https://docs.python.org/3/library/re.html#writing-a-tokenizer
> >
> > Signed-off-by: Mauro Carvalho Chehab <mchehab+huawei@kernel.org>
> > Message-ID: <c63ad36c81fe043e9e33ca55630414893f127413.1773074166.git.mchehab+huawei@kernel.org>
> > Message-ID: <8541ffa469647db1a7154f274fb2d55b4c127dcb.1773326442.git.mchehab+huawei@kernel.org>
>
> This is a combined effort to review this patch and to try out "b4 review",
> we'll see how it goes :).
>
> > diff --git a/tools/lib/python/kdoc/kdoc_re.py b/tools/lib/python/kdoc/kdoc_re.py
> > index 085b89a4547c0..7bed4e9a88108 100644
> > --- a/tools/lib/python/kdoc/kdoc_re.py
> > +++ b/tools/lib/python/kdoc/kdoc_re.py
> > @@ -141,6 +141,240 @@ class KernRe:
> > [ ... skip 4 lines ... ]
> > +
> > + @staticmethod
> > + def __str__(val):
> > + """Return the name of an enum value"""
> > + return TokType._name_by_val.get(val, f"UNKNOWN({val})")
> > +
>
> What is this class supposed to do?
This __str__() method ensures that, when printing a CToken object,
the name will be displayed, instead of a number. This is really
useful when debugging.
See, if I add a print:
<snip>
--- a/tools/lib/python/kdoc/kdoc_parser.py
+++ b/tools/lib/python/kdoc/kdoc_parser.py
@@ -87,6 +87,7 @@ def trim_private_members(text):
"""
tokens = CTokenizer(text)
+ print(tokens.tokens)
return str(tokens)
</snip>
the tokens will appear as names at the output:
$ ./scripts/kernel-doc -none er.c
[CToken(CToken.ENUM, "enum", 0, (0, 0, 0)), CToken(CToken.SPACE, " ", 4, (0, 0, 0)), CToken(CToken.NAME, "dmub_abm_ace_curve_type", 5, (0, 0, 0)), CToken(CToken.SPACE, " ", 28, (0, 0, 0)), CToken(CToken.BEGIN, "{", 29, (0, 0, 1)), CToken(CToken.SPACE, " ", 30, (0, 0, 1)), CToken(CToken.COMMENT, "/**
* ACE curve as defined by the SW layer. */", 31, (0, 0, 1)), CToken(CToken.SPACE, " ", 86, (0, 0, 1)), CToken(CToken.NAME, "ABM_ACE_CURVE_TYPE__SW", 87, (0, 0, 1)), CToken(CToken.SPACE, " ", 109, (0, 0, 1)), CToken(CToken.OP, "=", 110, (0, 0, 1)), CToken(CToken.SPACE, " ", 111, (0, 0, 1)), CToken(CToken.NUMBER, "0", 112, (0, 0, 1)), CToken(CToken.PUNC, ",", 113, (0, 0, 1)), CToken(CToken.SPACE, " ", 114, (0, 0, 1)), CToken(CToken.COMMENT, "/**
* ACE curve as defined by the SW to HW translation interface layer. */", 115, (0, 0, 1)), CToken(CToken.SPACE, " ", 198, (0, 0, 1)), CToken(CToken.NAME, "ABM_ACE_CURVE_TYPE__SW_IF", 199, (0, 0, 1)), CToken(CToken.SPACE, " ", 224, (0, 0, 1)), CToken(CToken.OP, "=", 225, (0, 0, 1)), CToken(CToken.SPACE, " ", 226, (0, 0, 1)), CToken(CToken.NUMBER, "1", 227, (0, 0, 1)), CToken(CToken.PUNC, ",", 228, (0, 0, 1)), CToken(CToken.SPACE, " ", 229, (0, 0, 1)), CToken(CToken.END, "}", 230, (0, 0, 0)), CToken(CToken.PUNC, ";", 231, (0, 0, 0))]
>
> > [ ... skip 27 lines ... ]
> > + _name_by_val = {v: k for k, v in dict(vars()).items() if isinstance(v, int)}
> > +
> > + # Dict to convert from string to an enum-like integer value.
> > + _name_to_val = {k: v for v, k in _name_by_val.items()}
> > +
> > + @staticmethod
>
> This stuff strikes me as a bit overdone; _name_to_val is really just the
> variable list for the class, right?
Those two vars are a kind of magic: they create two dictionaries:
- _name_by_val converts a token integer into a string;
- _name_to_val converts a string to an integer.
I opted to use this approach for a couple of reasons:
1. using tok.kind == "BEGIN" (and similar) everywhere is harder to
maintain, as python won't check for typos. Now, if one writes:
CToken.BEGHIN, an error will be raised;
2. the cost to convert from string to int is O(1), so not much
a performance issue at the conversion;
3. using an integer on all checks should make the code faster as
it doesn't require a loop to check the string.
>
> > [ ... skip 30 lines ... ]
> > + f"{self.brace_level}, {self.paren_level}, {self.bracket_level})"
> > +
> > +#: Tokens to parse C code.
> > +TOKEN_LIST = [
> > + (CToken.COMMENT, r"//[^\n]*|/\*[\s\S]*?\*/"),
> > +
>
> So these aren't "tokens", this is a list of regexes; how is it intended
> to be used?
Right. I could have named it better, like RE_TOKEN_LIST, or
TOKEN_REGEX, as at the original example, which came from Python
documentation for "re" module:
https://docs.python.org/3/library/re.html#writing-a-tokenizer
basically, we have the token type as the first element at the tuple,
and regex as the second one.
When regex matches, the CToken will be filled with kind=tuple[0].
the loop:
for match in re.finditer(TOKEN_LIST, code):
will parse the entire C code source, in order, converting it into
a token list. So, a file like this:
/**
* enum dmub_abm_ace_curve_type - ACE curve type.
*/
enum dmub_abm_ace_curve_type {
/**
* ACE curve as defined by the SW layer.
*/
ABM_ACE_CURVE_TYPE__SW = 0,
/**
* ACE curve as defined by the SW to HW translation interface layer.
*/
ABM_ACE_CURVE_TYPE__SW_IF = 1,
};
will become (I used pprint here to better align the tokens):
[CToken(CToken.ENUM, "enum", 0, (0, 0, 0)),
CToken(CToken.SPACE, " ", 4, (0, 0, 0)),
CToken(CToken.NAME, "dmub_abm_ace_curve_type", 5, (0, 0, 0)),
CToken(CToken.SPACE, " ", 28, (0, 0, 0)),
CToken(CToken.BEGIN, "{", 29, (0, 0, 1)),
CToken(CToken.SPACE, " ", 30, (0, 0, 1)),
CToken(CToken.COMMENT, "/**
* ACE curve as defined by the SW layer. */", 31, (0, 0, 1)),
CToken(CToken.SPACE, " ", 86, (0, 0, 1)),
CToken(CToken.NAME, "ABM_ACE_CURVE_TYPE__SW", 87, (0, 0, 1)),
CToken(CToken.SPACE, " ", 109, (0, 0, 1)),
CToken(CToken.OP, "=", 110, (0, 0, 1)),
CToken(CToken.SPACE, " ", 111, (0, 0, 1)),
CToken(CToken.NUMBER, "0", 112, (0, 0, 1)),
CToken(CToken.PUNC, ",", 113, (0, 0, 1)),
CToken(CToken.SPACE, " ", 114, (0, 0, 1)),
CToken(CToken.COMMENT, "/**
* ACE curve as defined by the SW to HW translation interface layer. */", 115, (0, 0, 1)),
CToken(CToken.SPACE, " ", 198, (0, 0, 1)),
CToken(CToken.NAME, "ABM_ACE_CURVE_TYPE__SW_IF", 199, (0, 0, 1)),
CToken(CToken.SPACE, " ", 224, (0, 0, 1)),
CToken(CToken.OP, "=", 225, (0, 0, 1)),
CToken(CToken.SPACE, " ", 226, (0, 0, 1)),
CToken(CToken.NUMBER, "1", 227, (0, 0, 1)),
CToken(CToken.PUNC, ",", 228, (0, 0, 1)),
CToken(CToken.SPACE, " ", 229, (0, 0, 1)),
CToken(CToken.END, "}", 230, (0, 0, 0)),
CToken(CToken.PUNC, ";", 231, (0, 0, 0))]
>
> > + (CToken.STRING, r'"(?:\\.|[^"\\])*"'),
> > + (CToken.CHAR, r"'(?:\\.|[^'\\])'"),
> > +
> > + (CToken.NUMBER, r"0[xX][0-9a-fA-F]+[uUlL]*|0[0-7]+[uUlL]*|"
>
> How does "[\s\S]*" differ from plain old "*" ?
They are not identical, as "*" doesn't match "\n". As the tokenizer
also picks "\n" on several cases, like on comments, r"\s\S" works
better.
> > [ ... skip 15 lines ... ]
> > + (CToken.STRUCT, r"\bstruct\b"),
> > + (CToken.UNION, r"\bunion\b"),
> > + (CToken.ENUM, r"\benum\b"),
> > + (CToken.TYPEDEF, r"\bkinddef\b"),
> > +
> > + (CToken.NAME, r"[A-Za-z_][A-Za-z0-9_]*"),
>
> "-" and "!" never need to be escaped.
"-" usually needs to be escaped, because it can be a range. I had
some troubles with the parser due to the lack of escapes, so I
ended being conservative.
( I'm finding a little bit hard to follow your comments...
Here, for instance, "!" is not at CToken.NAME regex.
Did b4 review place your comment at the wrong place? )
>
> > +
> > + (CToken.SPACE, r"[\s]+"),
> > +
> > + (CToken.MISMATCH,r"."),
> > +]
> > +
>
> "kinddef" ?
Should be "typedef".
This was due to a "sed s,type,kind," I applied to avoid using
"type" for the token type, as, when I started integrating it
with kdoc_re, it became confusing.
I'll fix at the next respin.
>
> > +#: Handle C continuation lines.
> > +RE_CONT = KernRe(r"\\\n")
> > +
> > +RE_COMMENT_START = KernRe(r'/\*\s*')
> > +
>
> Don't need the [brackets] here
where?
>
> > [ ... skip 6 lines ... ]
> > +
> > + When converted to string, it drops comments and handle public/private
> > + values, respecting depth.
> > + """
> > +
> > + # This class is inspired and follows the basic concepts of:
>
> That seems weird, why don't you just initialize it here?
Hard to tell what you're referring to. Maybe this:
RE_SCANNER = fill_re_scanner(TOKEN_LIST)
The rationale is that I don't want to re-create this every time,
as this is const.
>
> > [ ... skip 14 lines ... ]
> > + source = RE_CONT.sub("", source)
> > +
> > + brace_level = 0
> > + paren_level = 0
> > + bracket_level = 0
> > +
>
> Do you mean "iterator" here?
If you mean the typo at _tokenize() help text, yes:
interactor -> iterator
>
> > [ ... skip 33 lines ... ]
> > + in this particular case, it makes sense, as we can pick the name
> > + when matching a code via re_scanner().
> > + """
> > + global re_scanner
> > +
> > + if not re_scanner:
>
> Putting __init__() first is fairly standard, methinks.
class CTokenizer __init__() module calls _tokenize() method on it.
My personal preference is to have the caller methods before the methods
that actually call them, even inside a class, where the order doesn't
matter - or even in C, when we have an include with all prototypes.
But if you prefer, I can reorder it.
> > [ ... skip 15 lines ... ]
> > +
> > + for tok in self.tokens:
> > + if tok.kind == CToken.BEGIN:
> > + show_stack.append(show_stack[-1])
> > +
> > + elif tok.kind == CToken.END:
>
> I still don't understand why you do this here - this is all constant, right?
This one I didn't get to what part of the code you're referring to.
All constants on this code are using upper case names. They are:
- TOKEN_LIST (which should probably be named as TOKEN_REGEX_LIST).
- CToken enum-like names (BEGIN, END, OP, NAME, ...)
- three regexes (RE_CONT, RE_COMMENT_START, RE_SCANNER)
See, what the tokenizer does is a linear transformation from a C
source string into a token list.
So, for each instance, its content will change. Also, when we apply
CMatch logic, its content will also change.
>
> > + prev = show_stack[-1]
> > + if len(show_stack) > 1:
> > + show_stack.pop()
> > +
> > + if not prev and show_stack[-1]:
>
> So you create a nice iterator structure, then just put it all together into a
> list anyway?
Not sure what you meant here.
The end result of the tokenizer is a list of tokens, in the order
they appear at the source code.
To be able to handle public/private and do code transforms, using it
as a list is completely fine.
Now, if we want to use the tokenizer to parse things like:
typedef struct ca_descr_info {
unsigned int num;
unsigned int type;
} ca_descr_info_t;
Then having iterators to parse tokens on both directions
would be great, as the typedef identifier is at the end.
Thanks,
Mauro
On Thu, 12 Mar 2026 15:54:25 +0100, Mauro Carvalho Chehab <mchehab+huawei@kernel.org> wrote:
> Handling C code purely using regular expressions doesn't work well.
>
> Add a C tokenizer to help doing it the right way.
>
> The tokenizer was written using as basis the Python re documentation
> tokenizer example from:
> https://docs.python.org/3/library/re.html#writing-a-tokenizer
>
> Signed-off-by: Mauro Carvalho Chehab <mchehab+huawei@kernel.org>
> Message-ID: <c63ad36c81fe043e9e33ca55630414893f127413.1773074166.git.mchehab+huawei@kernel.org>
> Message-ID: <8541ffa469647db1a7154f274fb2d55b4c127dcb.1773326442.git.mchehab+huawei@kernel.org>
This is a combined effort to review this patch and to try out "b4 review",
we'll see how it goes :).
> diff --git a/tools/lib/python/kdoc/kdoc_re.py b/tools/lib/python/kdoc/kdoc_re.py
> index 085b89a4547c0..7bed4e9a88108 100644
> --- a/tools/lib/python/kdoc/kdoc_re.py
> +++ b/tools/lib/python/kdoc/kdoc_re.py
> @@ -141,6 +141,240 @@ class KernRe:
> [ ... skip 4 lines ... ]
> +
> + @staticmethod
> + def __str__(val):
> + """Return the name of an enum value"""
> + return TokType._name_by_val.get(val, f"UNKNOWN({val})")
> +
What is this class supposed to do?
> [ ... skip 27 lines ... ]
> + _name_by_val = {v: k for k, v in dict(vars()).items() if isinstance(v, int)}
> +
> + # Dict to convert from string to an enum-like integer value.
> + _name_to_val = {k: v for v, k in _name_by_val.items()}
> +
> + @staticmethod
This stuff strikes me as a bit overdone; _name_to_val is really just the
variable list for the class, right?
> [ ... skip 30 lines ... ]
> + f"{self.brace_level}, {self.paren_level}, {self.bracket_level})"
> +
> +#: Tokens to parse C code.
> +TOKEN_LIST = [
> + (CToken.COMMENT, r"//[^\n]*|/\*[\s\S]*?\*/"),
> +
So these aren't "tokens", this is a list of regexes; how is it intended
to be used?
> + (CToken.STRING, r'"(?:\\.|[^"\\])*"'),
> + (CToken.CHAR, r"'(?:\\.|[^'\\])'"),
> +
> + (CToken.NUMBER, r"0[xX][0-9a-fA-F]+[uUlL]*|0[0-7]+[uUlL]*|"
How does "[\s\S]*" differ from plain old "*" ?
> [ ... skip 15 lines ... ]
> + (CToken.STRUCT, r"\bstruct\b"),
> + (CToken.UNION, r"\bunion\b"),
> + (CToken.ENUM, r"\benum\b"),
> + (CToken.TYPEDEF, r"\bkinddef\b"),
> +
> + (CToken.NAME, r"[A-Za-z_][A-Za-z0-9_]*"),
"-" and "!" never need to be escaped.
> +
> + (CToken.SPACE, r"[\s]+"),
> +
> + (CToken.MISMATCH,r"."),
> +]
> +
"kinddef" ?
> +#: Handle C continuation lines.
> +RE_CONT = KernRe(r"\\\n")
> +
> +RE_COMMENT_START = KernRe(r'/\*\s*')
> +
Don't need the [brackets] here
> [ ... skip 6 lines ... ]
> +
> + When converted to string, it drops comments and handle public/private
> + values, respecting depth.
> + """
> +
> + # This class is inspired and follows the basic concepts of:
That seems weird, why don't you just initialize it here?
> [ ... skip 14 lines ... ]
> + source = RE_CONT.sub("", source)
> +
> + brace_level = 0
> + paren_level = 0
> + bracket_level = 0
> +
Do you mean "iterator" here?
> [ ... skip 33 lines ... ]
> + in this particular case, it makes sense, as we can pick the name
> + when matching a code via re_scanner().
> + """
> + global re_scanner
> +
> + if not re_scanner:
Putting __init__() first is fairly standard, methinks.
> [ ... skip 15 lines ... ]
> +
> + for tok in self.tokens:
> + if tok.kind == CToken.BEGIN:
> + show_stack.append(show_stack[-1])
> +
> + elif tok.kind == CToken.END:
I still don't understand why you do this here - this is all constant, right?
> + prev = show_stack[-1]
> + if len(show_stack) > 1:
> + show_stack.pop()
> +
> + if not prev and show_stack[-1]:
So you create a nice iterator structure, then just put it all together into a
list anyway?
--
Jonathan Corbet <corbet@lwn.net>
Uh, I find this review confusing.
Do your (Jon) comments refer to the code above them?
(more below)
On 3/16/26 4:03 PM, Jonathan Corbet wrote:
> On Thu, 12 Mar 2026 15:54:25 +0100, Mauro Carvalho Chehab <mchehab+huawei@kernel.org> wrote:
>> Handling C code purely using regular expressions doesn't work well.
>>
>> Add a C tokenizer to help doing it the right way.
>>
>> The tokenizer was written using as basis the Python re documentation
>> tokenizer example from:
>> https://docs.python.org/3/library/re.html#writing-a-tokenizer
>>
>> Signed-off-by: Mauro Carvalho Chehab <mchehab+huawei@kernel.org>
>> Message-ID: <c63ad36c81fe043e9e33ca55630414893f127413.1773074166.git.mchehab+huawei@kernel.org>
>> Message-ID: <8541ffa469647db1a7154f274fb2d55b4c127dcb.1773326442.git.mchehab+huawei@kernel.org>
>
> This is a combined effort to review this patch and to try out "b4 review",
> we'll see how it goes :).
>
>> diff --git a/tools/lib/python/kdoc/kdoc_re.py b/tools/lib/python/kdoc/kdoc_re.py
>> index 085b89a4547c0..7bed4e9a88108 100644
>> --- a/tools/lib/python/kdoc/kdoc_re.py
>> +++ b/tools/lib/python/kdoc/kdoc_re.py
>> @@ -141,6 +141,240 @@ class KernRe:
>> [ ... skip 4 lines ... ]
>> +
>> + @staticmethod
>> + def __str__(val):
>> + """Return the name of an enum value"""
>> + return TokType._name_by_val.get(val, f"UNKNOWN({val})")
>> +
>
> What is this class supposed to do?
>
>> [ ... skip 27 lines ... ]
>> + _name_by_val = {v: k for k, v in dict(vars()).items() if isinstance(v, int)}
>> +
>> + # Dict to convert from string to an enum-like integer value.
>> + _name_to_val = {k: v for v, k in _name_by_val.items()}
>> +
>> + @staticmethod
>
> This stuff strikes me as a bit overdone; _name_to_val is really just the
> variable list for the class, right?
>
>> [ ... skip 30 lines ... ]
>> + f"{self.brace_level}, {self.paren_level}, {self.bracket_level})"
>> +
>> +#: Tokens to parse C code.
>> +TOKEN_LIST = [
>> + (CToken.COMMENT, r"//[^\n]*|/\*[\s\S]*?\*/"),
>> +
>
> So these aren't "tokens", this is a list of regexes; how is it intended
> to be used?
>
>> + (CToken.STRING, r'"(?:\\.|[^"\\])*"'),
>> + (CToken.CHAR, r"'(?:\\.|[^'\\])'"),
>> +
>> + (CToken.NUMBER, r"0[xX][0-9a-fA-F]+[uUlL]*|0[0-7]+[uUlL]*|"
>
> How does "[\s\S]*" differ from plain old "*" ?
>
>> [ ... skip 15 lines ... ]
>> + (CToken.STRUCT, r"\bstruct\b"),
>> + (CToken.UNION, r"\bunion\b"),
>> + (CToken.ENUM, r"\benum\b"),
>> + (CToken.TYPEDEF, r"\bkinddef\b"),
>> +
>> + (CToken.NAME, r"[A-Za-z_][A-Za-z0-9_]*"),
>
> "-" and "!" never need to be escaped.
>
>> +
>> + (CToken.SPACE, r"[\s]+"),
>> +
>> + (CToken.MISMATCH,r"."),
>> +]
>> +
>
> "kinddef" ?
What does that refer to?
>
>> +#: Handle C continuation lines.
>> +RE_CONT = KernRe(r"\\\n")
>> +
>> +RE_COMMENT_START = KernRe(r'/\*\s*')
>> +
>
> Don't need the [brackets] here
what brackets?
>
>> [ ... skip 6 lines ... ]
>> +
>> + When converted to string, it drops comments and handle public/private
>> + values, respecting depth.
>> + """
>> +
>> + # This class is inspired and follows the basic concepts of:
>
> That seems weird, why don't you just initialize it here?
I can't tell what that comments refers to.
>> [ ... skip 14 lines ... ]
>> + source = RE_CONT.sub("", source)
>> +
>> + brace_level = 0
>> + paren_level = 0
>> + bracket_level = 0
>> +
>
> Do you mean "iterator" here?
Ditto.
>> [ ... skip 33 lines ... ]
>> + in this particular case, it makes sense, as we can pick the name
>> + when matching a code via re_scanner().
>> + """
>> + global re_scanner
>> +
>> + if not re_scanner:
>
> Putting __init__() first is fairly standard, methinks.
>
>> [ ... skip 15 lines ... ]
>> +
>> + for tok in self.tokens:
>> + if tok.kind == CToken.BEGIN:
>> + show_stack.append(show_stack[-1])
>> +
>> + elif tok.kind == CToken.END:
>
> I still don't understand why you do this here - this is all constant, right?
>
>> + prev = show_stack[-1]
>> + if len(show_stack) > 1:
>> + show_stack.pop()
>> +
>> + if not prev and show_stack[-1]:
>
> So you create a nice iterator structure, then just put it all together into a
> list anyway?
>
--
~Randy
On Mon, 16 Mar 2026 16:29:37 -0700
Randy Dunlap <rdunlap@infradead.org> wrote:
> Uh, I find this review confusing.
> Do your (Jon) comments refer to the code above them?
> (more below)
I was about to comment the same thing: it sounds that b4 review did a
big mess with your comments, as it is very hard to identify what part
of the code you're referring to.
I'll reply to your comments on a separate e-mail - at least the ones I
understand.
>
>
> On 3/16/26 4:03 PM, Jonathan Corbet wrote:
> > On Thu, 12 Mar 2026 15:54:25 +0100, Mauro Carvalho Chehab <mchehab+huawei@kernel.org> wrote:
> >> Handling C code purely using regular expressions doesn't work well.
> >>
> >> Add a C tokenizer to help doing it the right way.
> >>
> >> The tokenizer was written using as basis the Python re documentation
> >> tokenizer example from:
> >> https://docs.python.org/3/library/re.html#writing-a-tokenizer
> >>
> >> Signed-off-by: Mauro Carvalho Chehab <mchehab+huawei@kernel.org>
> >> Message-ID: <c63ad36c81fe043e9e33ca55630414893f127413.1773074166.git.mchehab+huawei@kernel.org>
> >> Message-ID: <8541ffa469647db1a7154f274fb2d55b4c127dcb.1773326442.git.mchehab+huawei@kernel.org>
> >
> > This is a combined effort to review this patch and to try out "b4 review",
> > we'll see how it goes :).
> >
> >> diff --git a/tools/lib/python/kdoc/kdoc_re.py b/tools/lib/python/kdoc/kdoc_re.py
> >> index 085b89a4547c0..7bed4e9a88108 100644
> >> --- a/tools/lib/python/kdoc/kdoc_re.py
> >> +++ b/tools/lib/python/kdoc/kdoc_re.py
> >> @@ -141,6 +141,240 @@ class KernRe:
> >> [ ... skip 4 lines ... ]
> >> +
> >> + @staticmethod
> >> + def __str__(val):
> >> + """Return the name of an enum value"""
> >> + return TokType._name_by_val.get(val, f"UNKNOWN({val})")
> >> +
> >
> > What is this class supposed to do?
> >
> >> [ ... skip 27 lines ... ]
> >> + _name_by_val = {v: k for k, v in dict(vars()).items() if isinstance(v, int)}
> >> +
> >> + # Dict to convert from string to an enum-like integer value.
> >> + _name_to_val = {k: v for v, k in _name_by_val.items()}
> >> +
> >> + @staticmethod
> >
> > This stuff strikes me as a bit overdone; _name_to_val is really just the
> > variable list for the class, right?
> >
> >> [ ... skip 30 lines ... ]
> >> + f"{self.brace_level}, {self.paren_level}, {self.bracket_level})"
> >> +
> >> +#: Tokens to parse C code.
> >> +TOKEN_LIST = [
> >> + (CToken.COMMENT, r"//[^\n]*|/\*[\s\S]*?\*/"),
> >> +
> >
> > So these aren't "tokens", this is a list of regexes; how is it intended
> > to be used?
> >
> >> + (CToken.STRING, r'"(?:\\.|[^"\\])*"'),
> >> + (CToken.CHAR, r"'(?:\\.|[^'\\])'"),
> >> +
> >> + (CToken.NUMBER, r"0[xX][0-9a-fA-F]+[uUlL]*|0[0-7]+[uUlL]*|"
> >
> > How does "[\s\S]*" differ from plain old "*" ?
> >
> >> [ ... skip 15 lines ... ]
> >> + (CToken.STRUCT, r"\bstruct\b"),
> >> + (CToken.UNION, r"\bunion\b"),
> >> + (CToken.ENUM, r"\benum\b"),
> >> + (CToken.TYPEDEF, r"\bkinddef\b"),
> >> +
> >> + (CToken.NAME, r"[A-Za-z_][A-Za-z0-9_]*"),
> >
> > "-" and "!" never need to be escaped.
> >
> >> +
> >> + (CToken.SPACE, r"[\s]+"),
> >> +
> >> + (CToken.MISMATCH,r"."),
> >> +]
> >> +
> >
> > "kinddef" ?
>
> What does that refer to?
>
> >
> >> +#: Handle C continuation lines.
> >> +RE_CONT = KernRe(r"\\\n")
> >> +
> >> +RE_COMMENT_START = KernRe(r'/\*\s*')
> >> +
> >
> > Don't need the [brackets] here
>
> what brackets?
>
> >
> >> [ ... skip 6 lines ... ]
> >> +
> >> + When converted to string, it drops comments and handle public/private
> >> + values, respecting depth.
> >> + """
> >> +
> >> + # This class is inspired and follows the basic concepts of:
> >
> > That seems weird, why don't you just initialize it here?
>
> I can't tell what that comments refers to.
>
> >> [ ... skip 14 lines ... ]
> >> + source = RE_CONT.sub("", source)
> >> +
> >> + brace_level = 0
> >> + paren_level = 0
> >> + bracket_level = 0
> >> +
> >
> > Do you mean "iterator" here?
>
> Ditto.
>
> >> [ ... skip 33 lines ... ]
> >> + in this particular case, it makes sense, as we can pick the name
> >> + when matching a code via re_scanner().
> >> + """
> >> + global re_scanner
> >> +
> >> + if not re_scanner:
> >
> > Putting __init__() first is fairly standard, methinks.
> >
> >> [ ... skip 15 lines ... ]
> >> +
> >> + for tok in self.tokens:
> >> + if tok.kind == CToken.BEGIN:
> >> + show_stack.append(show_stack[-1])
> >> +
> >> + elif tok.kind == CToken.END:
> >
> > I still don't understand why you do this here - this is all constant, right?
> >
> >> + prev = show_stack[-1]
> >> + if len(show_stack) > 1:
> >> + show_stack.pop()
> >> +
> >> + if not prev and show_stack[-1]:
> >
> > So you create a nice iterator structure, then just put it all together into a
> > list anyway?
> >
>
Thanks,
Mauro
Randy Dunlap <rdunlap@infradead.org> writes:
> Uh, I find this review confusing.
> Do your (Jon) comments refer to the code above them?
> (more below)
They do
Or, at least, they did...but they clearly got mixed up in the sending
somewhere. Below is the intended version...
> tools/lib/python/kdoc/kdoc_re.py | 234 +++++++++++++++++++++++++++++++
> 1 file changed, 234 insertions(+)
>
> diff --git a/tools/lib/python/kdoc/kdoc_re.py b/tools/lib/python/kdoc/kdoc_re.py
> index 085b89a4547c..7bed4e9a8810 100644
> --- a/tools/lib/python/kdoc/kdoc_re.py
> +++ b/tools/lib/python/kdoc/kdoc_re.py
> @@ -141,6 +141,240 @@ class KernRe:
>
> return self.last_match.groups()
>
> +class TokType():
> +
> + @staticmethod
> + def __str__(val):
> + ""Return the name of an enum value""
> + return TokType._name_by_val.get(val, f"UNKNOWN({val})")
What is this class supposed to do?
> +
> +class CToken():
> + ""
> + Data class to define a C token.
> + ""
> +
> + # Tokens that can be used by the parser. Works like an C enum.
> +
> + COMMENT = 0 #: A standard C or C99 comment, including delimiter.
> + STRING = 1 #: A string, including quotation marks.
> + CHAR = 2 #: A character, including apostophes.
> + NUMBER = 3 #: A number.
> + PUNC = 4 #: A puntuation mark: ``;`` / ``,`` / ``.``.
> + BEGIN = 5 #: A begin character: ``{`` / ``[`` / ``(``.
> + END = 6 #: A end character: ``}`` / ``]`` / ``)``.
> + CPP = 7 #: A preprocessor macro.
> + HASH = 8 #: The hash character - useful to handle other macros.
> + OP = 9 #: A C operator (add, subtract, ...).
> + STRUCT = 10 #: A ``struct`` keyword.
> + UNION = 11 #: An ``union`` keyword.
> + ENUM = 12 #: A ``struct`` keyword.
> + TYPEDEF = 13 #: A ``typedef`` keyword.
> + NAME = 14 #: A name. Can be an ID or a type.
> + SPACE = 15 #: Any space characters, including new lines
> +
> + MISMATCH = 255 #: an error indicator: should never happen in practice.
> +
> + # Dict to convert from an enum interger into a string.
> + _name_by_val = {v: k for k, v in dict(vars()).items() if isinstance(v, int)}
> +
> + # Dict to convert from string to an enum-like integer value.
> + _name_to_val = {k: v for v, k in _name_by_val.items()}
This stuff strikes me as a bit overdone; _name_to_val is really just the
variable list for the class, right?
> +
> + @staticmethod
> + def to_name(val):
> + ""Convert from an integer value from CToken enum into a string""
> +
> + return CToken._name_by_val.get(val, f"UNKNOWN({val})")
> +
> + @staticmethod
> + def from_name(name):
> + ""Convert a string into a CToken enum value""
> + if name in CToken._name_to_val:
> + return CToken._name_to_val[name]
> +
> + return CToken.MISMATCH
> +
> + def __init__(self, kind, value, pos,
> + brace_level, paren_level, bracket_level):
> + self.kind = kind
> + self.value = value
> + self.pos = pos
> + self.brace_level = brace_level
> + self.paren_level = paren_level
> + self.bracket_level = bracket_level
> +
> + def __repr__(self):
> + name = self.to_name(self.kind)
> + if isinstance(self.value, str):
> + value = '"' + self.value + '"'
> + else:
> + value = self.value
> +
> + return f"CToken({name}, {value}, {self.pos}, " \
> + f"{self.brace_level}, {self.paren_level}, {self.bracket_level})"
> +
> +#: Tokens to parse C code.
> +TOKEN_LIST = [
So these aren't "tokens", this is a list of regexes; how is it intended
to be used?
> + (CToken.COMMENT, r"//[^\n]*|/\*[\s\S]*?\*/"),
How does "[\s\S]*" differ from plain old "*" ?
> +
> + (CToken.STRING, r'"(?:\\.|[^"\\])*"'),
> + (CToken.CHAR, r"'(?:\\.|[^'\\])'"),
> +
> + (CToken.NUMBER, r"0[xX][0-9a-fA-F]+[uUlL]*|0[0-7]+[uUlL]*|"
> + r"[0-9]+(\.[0-9]*)?([eE][+-]?[0-9]+)?[fFlL]*"),
> +
> + (CToken.PUNC, r"[;,\.]"),
> +
> + (CToken.BEGIN, r"[\[\(\{]"),
> +
> + (CToken.END, r"[\]\)\}]"),
> +
> + (CToken.CPP, r"#\s*(define|include|ifdef|ifndef|if|else|elif|endif|undef|pragma)\b"),
> +
> + (CToken.HASH, r"#"),
> +
> + (CToken.OP, r"\+\+|\-\-|\->|==|\!=|<=|>=|&&|\|\||<<|>>|\+=|\-=|\*=|/=|%="
> + r"|&=|\|=|\^=|=|\+|\-|\*|/|%|<|>|&|\||\^|~|!|\?|\:"),
"-" and "!" never need to be escaped.
> +
> + (CToken.STRUCT, r"\bstruct\b"),
> + (CToken.UNION, r"\bunion\b"),
> + (CToken.ENUM, r"\benum\b"),
> + (CToken.TYPEDEF, r"\bkinddef\b"),
"kinddef" ?
> +
> + (CToken.NAME, r"[A-Za-z_][A-Za-z0-9_]*"),
> +
> + (CToken.SPACE, r"[\s]+"),
Don't need the [brackets] here
> +
> + (CToken.MISMATCH,r"."),
> +]
> +
> +#: Handle C continuation lines.
> +RE_CONT = KernRe(r"\\\n")
> +
> +RE_COMMENT_START = KernRe(r'/\*\s*')
> +
> +#: tokenizer regex. Will be filled at the first CTokenizer usage.
> +re_scanner = None
That seems weird, why don't you just initialize it here?
> +
> +class CTokenizer():
> + ""
> + Scan C statements and definitions and produce tokens.
> +
> + When converted to string, it drops comments and handle public/private
> + values, respecting depth.
> + ""
> +
> + # This class is inspired and follows the basic concepts of:
> + # https://docs.python.org/3/library/re.html#writing-a-tokenizer
> +
> + def _tokenize(self, source):
> + ""
> + Interactor that parses ``source``, splitting it into tokens, as defined
> + at ``self.TOKEN_LIST``.
> +
> + The interactor returns a CToken class object.
> + ""
Do you mean "iterator" here?
> +
> + # Handle continuation lines. Note that kdoc_parser already has a
> + # logic to do that. Still, let's keep it for completeness, as we might
> + # end re-using this tokenizer outsize kernel-doc some day - or we may
> + # eventually remove from there as a future cleanup.
> + source = RE_CONT.sub(", source)
> +
> + brace_level = 0
> + paren_level = 0
> + bracket_level = 0
> +
> + for match in re_scanner.finditer(source):
> + kind = CToken.from_name(match.lastgroup)
> + pos = match.start()
> + value = match.group()
> +
> + if kind == CToken.MISMATCH:
> + raise RuntimeError(f"Unexpected token '{value}' on {pos}:\n\t{source}")
> + elif kind == CToken.BEGIN:
> + if value == '(':
> + paren_level += 1
> + elif value == '[':
> + bracket_level += 1
> + else: # value == '{'
> + brace_level += 1
> +
> + elif kind == CToken.END:
> + if value == ')' and paren_level > 0:
> + paren_level -= 1
> + elif value == ']' and bracket_level > 0:
> + bracket_level -= 1
> + elif brace_level > 0: # value == '}'
> + brace_level -= 1
> +
> + yield CToken(kind, value, pos,
> + brace_level, paren_level, bracket_level)
> +
> + def __init__(self, source):
Putting __init__() first is fairly standard, methinks.
> + ""
> + Create a regular expression to handle TOKEN_LIST.
> +
> + While I generally don't like using regex group naming via:
> + (?P<name>...)
> +
> + in this particular case, it makes sense, as we can pick the name
> + when matching a code via re_scanner().
> + ""
> + global re_scanner
> +
> + if not re_scanner:
> + re_tokens = []
> +
> + for kind, pattern in TOKEN_LIST:
> + name = CToken.to_name(kind)
> + re_tokens.append(f"(?P<{name}>{pattern})")
> +
> + re_scanner = KernRe("|".join(re_tokens), re.MULTILINE | re.DOTALL)
I still don't understand why you do this here - this is all constant, right?
> +
> + self.tokens = []
> + for tok in self._tokenize(source):
> + self.tokens.append(tok)
So you create a nice iterator structure, then just put it all together into a
list anyway?
> +
> + def __str__(self):
> + out="
> + show_stack = [True]
> +
> + for tok in self.tokens:
> + if tok.kind == CToken.BEGIN:
> + show_stack.append(show_stack[-1])
> +
> + elif tok.kind == CToken.END:
> + prev = show_stack[-1]
> + if len(show_stack) > 1:
> + show_stack.pop()
> +
> + if not prev and show_stack[-1]:
> + #
> + # Try to preserve indent
> + #
> + out += "\t" * (len(show_stack) - 1)
> +
> + out += str(tok.value)
> + continue
> +
> + elif tok.kind == CToken.COMMENT:
> + comment = RE_COMMENT_START.sub(", tok.value)
> +
> + if comment.startswith("private:"):
> + show_stack[-1] = False
> + show = False
> + elif comment.startswith("public:"):
> + show_stack[-1] = True
> +
> + continue
> +
> + if show_stack[-1]:
> + out += str(tok.value)
> +
> + return out
> +
> +
> #: Nested delimited pairs (brackets and parenthesis)
> DELIMITER_PAIRS = {
> '{': '}',
Thanks,
jon
On Mon, 16 Mar 2026 17:40:22 -0600
Jonathan Corbet <corbet@lwn.net> wrote:
> Randy Dunlap <rdunlap@infradead.org> writes:
>
> > Uh, I find this review confusing.
> > Do your (Jon) comments refer to the code above them?
> > (more below)
>
> They do
>
> Or, at least, they did...but they clearly got mixed up in the sending
> somewhere. Below is the intended version...
Oh, I should have read this one before... Ignore my previous comment.
I'll move the answers to this reply, and answer the other ones.
> > tools/lib/python/kdoc/kdoc_re.py | 234 +++++++++++++++++++++++++++++++
> > 1 file changed, 234 insertions(+)
> >
> > diff --git a/tools/lib/python/kdoc/kdoc_re.py b/tools/lib/python/kdoc/kdoc_re.py
> > index 085b89a4547c..7bed4e9a8810 100644
> > --- a/tools/lib/python/kdoc/kdoc_re.py
> > +++ b/tools/lib/python/kdoc/kdoc_re.py
> > @@ -141,6 +141,240 @@ class KernRe:
> >
> > return self.last_match.groups()
> >
> > +class TokType():
> > +
> > + @staticmethod
> > + def __str__(val):
> > + ""Return the name of an enum value""
> > + return TokType._name_by_val.get(val, f"UNKNOWN({val})")
>
> What is this class supposed to do?
This __str__() method ensures that, when printing a CToken object,
the name will be displayed, instead of a number. This is really
useful when debugging.
See, if I add a print:
<snip>
--- a/tools/lib/python/kdoc/kdoc_parser.py
+++ b/tools/lib/python/kdoc/kdoc_parser.py
@@ -87,6 +87,7 @@ def trim_private_members(text):
"""
tokens = CTokenizer(text)
+ print(tokens.tokens)
return str(tokens)
</snip>
the tokens will appear as names at the output:
$ ./scripts/kernel-doc -none er.c
[CToken(CToken.ENUM, "enum", 0, (0, 0, 0)), CToken(CToken.SPACE, " ", 4, (0, 0, 0)), CToken(CToken.NAME, "dmub_abm_ace_curve_type", 5, (0, 0, 0)), CToken(CToken.SPACE, " ", 28, (0, 0, 0)), CToken(CToken.BEGIN, "{", 29, (0, 0, 1)), CToken(CToken.SPACE, " ", 30, (0, 0, 1)), CToken(CToken.COMMENT, "/**
* ACE curve as defined by the SW layer. */", 31, (0, 0, 1)), CToken(CToken.SPACE, " ", 86, (0, 0, 1)), CToken(CToken.NAME, "ABM_ACE_CURVE_TYPE__SW", 87, (0, 0, 1)), CToken(CToken.SPACE, " ", 109, (0, 0, 1)), CToken(CToken.OP, "=", 110, (0, 0, 1)), CToken(CToken.SPACE, " ", 111, (0, 0, 1)), CToken(CToken.NUMBER, "0", 112, (0, 0, 1)), CToken(CToken.PUNC, ",", 113, (0, 0, 1)), CToken(CToken.SPACE, " ", 114, (0, 0, 1)), CToken(CToken.COMMENT, "/**
* ACE curve as defined by the SW to HW translation interface layer. */", 115, (0, 0, 1)), CToken(CToken.SPACE, " ", 198, (0, 0, 1)), CToken(CToken.NAME, "ABM_ACE_CURVE_TYPE__SW_IF", 199, (0, 0, 1)), CToken(CToken.SPACE, " ", 224, (0, 0, 1)), CToken(CToken.OP, "=", 225, (0, 0, 1)), CToken(CToken.SPACE, " ", 226, (0, 0, 1)), CToken(CToken.NUMBER, "1", 227, (0, 0, 1)), CToken(CToken.PUNC, ",", 228, (0, 0, 1)), CToken(CToken.SPACE, " ", 229, (0, 0, 1)), CToken(CToken.END, "}", 230, (0, 0, 0)), CToken(CToken.PUNC, ";", 231, (0, 0, 0))]
>
> > +
> > +class CToken():
> > + ""
> > + Data class to define a C token.
> > + ""
> > +
> > + # Tokens that can be used by the parser. Works like an C enum.
> > +
> > + COMMENT = 0 #: A standard C or C99 comment, including delimiter.
> > + STRING = 1 #: A string, including quotation marks.
> > + CHAR = 2 #: A character, including apostophes.
> > + NUMBER = 3 #: A number.
> > + PUNC = 4 #: A puntuation mark: ``;`` / ``,`` / ``.``.
> > + BEGIN = 5 #: A begin character: ``{`` / ``[`` / ``(``.
> > + END = 6 #: A end character: ``}`` / ``]`` / ``)``.
> > + CPP = 7 #: A preprocessor macro.
> > + HASH = 8 #: The hash character - useful to handle other macros.
> > + OP = 9 #: A C operator (add, subtract, ...).
> > + STRUCT = 10 #: A ``struct`` keyword.
> > + UNION = 11 #: An ``union`` keyword.
> > + ENUM = 12 #: A ``struct`` keyword.
> > + TYPEDEF = 13 #: A ``typedef`` keyword.
> > + NAME = 14 #: A name. Can be an ID or a type.
> > + SPACE = 15 #: Any space characters, including new lines
> > +
> > + MISMATCH = 255 #: an error indicator: should never happen in practice.
> > +
> > + # Dict to convert from an enum interger into a string.
> > + _name_by_val = {v: k for k, v in dict(vars()).items() if isinstance(v, int)}
> > +
> > + # Dict to convert from string to an enum-like integer value.
> > + _name_to_val = {k: v for v, k in _name_by_val.items()}
>
> This stuff strikes me as a bit overdone; _name_to_val is really just the
> variable list for the class, right?
Those two vars are a kind of magic: they create two dictionaries:
- _name_by_val converts a token integer into a string;
- _name_to_val converts a string to an integer.
I opted to use this approach for a couple of reasons:
1. using tok.kind == "BEGIN" (and similar) everywhere is harder to
maintain, as python won't check for typos. Now, if one writes:
CToken.BEGHIN, an error will be raised;
2. the cost to convert from string to int is O(1), so it is not
much of a performance issue at conversion time;
3. using an integer on all checks should make the code faster as
it doesn't require a loop to check the string.
>
> > +
> > + @staticmethod
> > + def to_name(val):
> > + ""Convert from an integer value from CToken enum into a string""
> > +
> > + return CToken._name_by_val.get(val, f"UNKNOWN({val})")
> > +
> > + @staticmethod
> > + def from_name(name):
> > + ""Convert a string into a CToken enum value""
> > + if name in CToken._name_to_val:
> > + return CToken._name_to_val[name]
> > +
> > + return CToken.MISMATCH
> > +
> > + def __init__(self, kind, value, pos,
> > + brace_level, paren_level, bracket_level):
> > + self.kind = kind
> > + self.value = value
> > + self.pos = pos
> > + self.brace_level = brace_level
> > + self.paren_level = paren_level
> > + self.bracket_level = bracket_level
> > +
> > + def __repr__(self):
> > + name = self.to_name(self.kind)
> > + if isinstance(self.value, str):
> > + value = '"' + self.value + '"'
> > + else:
> > + value = self.value
> > +
> > + return f"CToken({name}, {value}, {self.pos}, " \
> > + f"{self.brace_level}, {self.paren_level}, {self.bracket_level})"
> > +
> > +#: Tokens to parse C code.
> > +TOKEN_LIST = [
>
> So these aren't "tokens", this is a list of regexes; how is it intended
> to be used?
>
> > + (CToken.COMMENT, r"//[^\n]*|/\*[\s\S]*?\*/"),
>
> How does "[\s\S]*" differ from plain old "*" ?
They are not identical, as "*" doesn't match "\n". As the tokenizer
also picks up "\n" in several cases, like in comments, r"[\s\S]" works
better.
>
> > +
> > + (CToken.STRING, r'"(?:\\.|[^"\\])*"'),
> > + (CToken.CHAR, r"'(?:\\.|[^'\\])'"),
> > +
> > + (CToken.NUMBER, r"0[xX][0-9a-fA-F]+[uUlL]*|0[0-7]+[uUlL]*|"
> > + r"[0-9]+(\.[0-9]*)?([eE][+-]?[0-9]+)?[fFlL]*"),
> > +
> > + (CToken.PUNC, r"[;,\.]"),
> > +
> > + (CToken.BEGIN, r"[\[\(\{]"),
> > +
> > + (CToken.END, r"[\]\)\}]"),
> > +
> > + (CToken.CPP, r"#\s*(define|include|ifdef|ifndef|if|else|elif|endif|undef|pragma)\b"),
> > +
> > + (CToken.HASH, r"#"),
> > +
> > + (CToken.OP, r"\+\+|\-\-|\->|==|\!=|<=|>=|&&|\|\||<<|>>|\+=|\-=|\*=|/=|%="
> > + r"|&=|\|=|\^=|=|\+|\-|\*|/|%|<|>|&|\||\^|~|!|\?|\:"),
>
> "-" and "!" never need to be escaped.
"-" usually needs to be escaped, because it can be a range. I actually
tried without escaping it, but the regex failed. So I ended being
conservative.
>
> > +
> > + (CToken.STRUCT, r"\bstruct\b"),
> > + (CToken.UNION, r"\bunion\b"),
> > + (CToken.ENUM, r"\benum\b"),
> > + (CToken.TYPEDEF, r"\bkinddef\b"),
>
> "kinddef" ?
Should be "typedef".
This was due to a "sed s,type,kind," I applied to avoid using
"type" for the token type, as, when I started integrating it
with kdoc_re, it became confusing.
I'll fix at the next respin.
>
> > +
> > + (CToken.NAME, r"[A-Za-z_][A-Za-z0-9_]*"),
> > +
> > + (CToken.SPACE, r"[\s]+"),
>
> Don't need the [brackets] here
True. This was [ \t] and there was a separate token for new line.
I merged them, but forgot stripping the brackets.
Will cleanup at the next respin.
>
> > +
> > + (CToken.MISMATCH,r"."),
> > +]
> > +
> > +#: Handle C continuation lines.
> > +RE_CONT = KernRe(r"\\\n")
> > +
> > +RE_COMMENT_START = KernRe(r'/\*\s*')
> > +
> > +#: tokenizer regex. Will be filled at the first CTokenizer usage.
> > +re_scanner = None
>
> That seems weird, why don't you just initialize it here?
Yeah, I changed this one to:
def fill_re_scanner(token_list):
"""Ancillary routine to convert TOKEN_LIST into a finditer regex"""
re_tokens = []
for kind, pattern in token_list:
name = CToken.to_name(kind)
re_tokens.append(f"(?P<{name}>{pattern})")
return KernRe("|".join(re_tokens), re.MULTILINE | re.DOTALL)
RE_SCANNER = fill_re_scanner(TOKEN_LIST)
but I guess this is in a patch later on.
>
> > +
> > +class CTokenizer():
> > + ""
> > + Scan C statements and definitions and produce tokens.
> > +
> > + When converted to string, it drops comments and handle public/private
> > + values, respecting depth.
> > + ""
> > +
> > + # This class is inspired and follows the basic concepts of:
> > + # https://docs.python.org/3/library/re.html#writing-a-tokenizer
> > +
> > + def _tokenize(self, source):
> > + ""
> > + Interactor that parses ``source``, splitting it into tokens, as defined
> > + at ``self.TOKEN_LIST``.
> > +
> > + The interactor returns a CToken class object.
> > + ""
>
> Do you mean "iterator" here?
Yes. will fix at the next respin.
>
> > +
> > + # Handle continuation lines. Note that kdoc_parser already has a
> > + # logic to do that. Still, let's keep it for completeness, as we might
> > + # end re-using this tokenizer outsize kernel-doc some day - or we may
> > + # eventually remove from there as a future cleanup.
> > + source = RE_CONT.sub(", source)
> > +
> > + brace_level = 0
> > + paren_level = 0
> > + bracket_level = 0
> > +
> > + for match in re_scanner.finditer(source):
> > + kind = CToken.from_name(match.lastgroup)
> > + pos = match.start()
> > + value = match.group()
> > +
> > + if kind == CToken.MISMATCH:
> > + raise RuntimeError(f"Unexpected token '{value}' on {pos}:\n\t{source}")
> > + elif kind == CToken.BEGIN:
> > + if value == '(':
> > + paren_level += 1
> > + elif value == '[':
> > + bracket_level += 1
> > + else: # value == '{'
> > + brace_level += 1
> > +
> > + elif kind == CToken.END:
> > + if value == ')' and paren_level > 0:
> > + paren_level -= 1
> > + elif value == ']' and bracket_level > 0:
> > + bracket_level -= 1
> > + elif brace_level > 0: # value == '}'
> > + brace_level -= 1
> > +
> > + yield CToken(kind, value, pos,
> > + brace_level, paren_level, bracket_level)
> > +
> > + def __init__(self, source):
>
> Putting __init__() first is fairly standard, methinks.
Yes, but __init__ calls _tokenize().
My personal preference is to have the caller methods before the methods
that actually call them, even inside a class, where the order doesn't
matter - or even in C, when we have an include with all prototypes.
But if you prefer, I can reorder it.
>
> > + ""
> > + Create a regular expression to handle TOKEN_LIST.
> > +
> > + While I generally don't like using regex group naming via:
> > + (?P<name>...)
> > +
> > + in this particular case, it makes sense, as we can pick the name
> > + when matching a code via re_scanner().
> > + ""
> > + global re_scanner
> > +
> > + if not re_scanner:
> > + re_tokens = []
> > +
> > + for kind, pattern in TOKEN_LIST:
> > + name = CToken.to_name(kind)
> > + re_tokens.append(f"(?P<{name}>{pattern})")
> > +
> > + re_scanner = KernRe("|".join(re_tokens), re.MULTILINE | re.DOTALL)
>
> I still don't understand why you do this here - this is all constant, right?
Yes. See above. I moved this logic to a function and called it during
module init time, for it to happen just once.
>
> > +
> > + self.tokens = []
> > + for tok in self._tokenize(source):
> > + self.tokens.append(tok)
>
> So you create a nice iterator structure, then just put it all together into a
> list anyway?
We could have used yield here, but what's the point? Due to C
transforms, we'll need to iterate over all tokens multiple times.
Having them in a list ends up saving time, as we only need to
tokenize once per source code.
>
> > +
> > + def __str__(self):
> > + out="
> > + show_stack = [True]
> > +
> > + for tok in self.tokens:
> > + if tok.kind == CToken.BEGIN:
> > + show_stack.append(show_stack[-1])
> > +
> > + elif tok.kind == CToken.END:
> > + prev = show_stack[-1]
> > + if len(show_stack) > 1:
> > + show_stack.pop()
> > +
> > + if not prev and show_stack[-1]:
> > + #
> > + # Try to preserve indent
> > + #
> > + out += "\t" * (len(show_stack) - 1)
> > +
> > + out += str(tok.value)
> > + continue
> > +
> > + elif tok.kind == CToken.COMMENT:
> > + comment = RE_COMMENT_START.sub(", tok.value)
> > +
> > + if comment.startswith("private:"):
> > + show_stack[-1] = False
> > + show = False
> > + elif comment.startswith("public:"):
> > + show_stack[-1] = True
> > +
> > + continue
> > +
> > + if show_stack[-1]:
> > + out += str(tok.value)
> > +
> > + return out
> > +
> > +
> > #: Nested delimited pairs (brackets and parenthesis)
> > DELIMITER_PAIRS = {
> > '{': '}',
>
> Thanks,
>
> jon
>
Thanks,
Mauro
Mauro Carvalho Chehab <mchehab+huawei@kernel.org> writes:
>> > tools/lib/python/kdoc/kdoc_re.py | 234 +++++++++++++++++++++++++++++++
>> > 1 file changed, 234 insertions(+)
>> >
>> > diff --git a/tools/lib/python/kdoc/kdoc_re.py b/tools/lib/python/kdoc/kdoc_re.py
>> > index 085b89a4547c..7bed4e9a8810 100644
>> > --- a/tools/lib/python/kdoc/kdoc_re.py
>> > +++ b/tools/lib/python/kdoc/kdoc_re.py
>> > @@ -141,6 +141,240 @@ class KernRe:
>> >
>> > return self.last_match.groups()
>> >
>> > +class TokType():
>> > +
>> > + @staticmethod
>> > + def __str__(val):
>> > + ""Return the name of an enum value""
>> > + return TokType._name_by_val.get(val, f"UNKNOWN({val})")
>>
>> What is this class supposed to do?
>
> This __str__() method ensures that, when printing a CToken object,
> the name will be displayed, instead of a number. This is really
> useful when debugging.
I was talking about the TokType class, though, not CToken. This class
doesn't appear to be used anywhere. Indeed, I notice now that when you
relocate CToken in patch 7, TokType is silently removed. So perhaps
it's better not to introduce it in the first place :)
jon
© 2016 - 2026 Red Hat, Inc.