[v2] kernel-doc: use a C lexical tokenizer for transforms

[PATCH v2 29/28] docs: kdoc: ensure that comments are dropped before calling split_struct_proto()

Posted by Mauro Carvalho Chehab 3 weeks, 4 days ago

Changeset 2b957decdb6c ("docs: kdoc: don't add broken comments inside prototypes")
revealed a hidden bug in split_struct_proto(): some comments in the prototype
may prevent it from properly identifying a struct.

Fix it by stripping comments before calling split_struct_proto().

Fixes: 2b957decdb6c ("docs: kdoc: don't add broken comments inside prototypes")
Signed-off-by: Mauro Carvalho Chehab <mchehab+huawei@kernel.org>
---
 tools/lib/python/kdoc/kdoc_parser.py | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/tools/lib/python/kdoc/kdoc_parser.py b/tools/lib/python/kdoc/kdoc_parser.py
index 3ff17b07c1c9..ed378edb1e05 100644
--- a/tools/lib/python/kdoc/kdoc_parser.py
+++ b/tools/lib/python/kdoc/kdoc_parser.py
@@ -724,6 +724,7 @@ class KernelDoc:
         #
         # Do the basic parse to get the pieces of the declaration.
         #
+        proto = trim_private_members(proto)
         struct_parts = self.split_struct_proto(proto)
         if not struct_parts:
             self.emit_msg(ln, f"{proto} error: Cannot parse struct or union!")
@@ -764,6 +765,7 @@ class KernelDoc:
         # Strip preprocessor directives.  Note that this depends on the
         # trailing semicolon we added in process_proto_type().
         #
+        proto = trim_private_members(proto)
         proto = KernRe(r'#\s*((define|ifdef|if)\s+|endif)[^;]*;', flags=re.S).sub('', proto)
         #
         # Parse out the name and members of the enum.  Typedef form first.
@@ -771,7 +773,7 @@ class KernelDoc:
         r = KernRe(r'typedef\s+enum\s*\{(.*)\}\s*(\w*)\s*;')
         if r.search(proto):
             declaration_name = r.group(2)
-            members = trim_private_members(r.group(1))
+            members = r.group(1)
         #
         # Failing that, look for a straight enum
         #
@@ -779,7 +781,7 @@ class KernelDoc:
             r = KernRe(r'enum\s+(\w*)\s*\{(.*)\}')
             if r.match(proto):
                 declaration_name = r.group(1)
-                members = trim_private_members(r.group(2))
+                members = r.group(2)
         #
         # OK, this isn't going to work.
         #
-- 
2.53.0

RE: [PATCH v2 29/28] docs: kdoc: ensure that comments are dropped before calling split_struct_proto()

Posted by Loktionov, Aleksandr 3 weeks, 4 days ago


> -----Original Message-----
> From: Mauro Carvalho Chehab <mchehab+huawei@kernel.org>
> Sent: Friday, March 13, 2026 9:34 AM
> To: Jonathan Corbet <corbet@lwn.net>; Linux Doc Mailing List <linux-
> doc@vger.kernel.org>
> Cc: Mauro Carvalho Chehab <mchehab+huawei@kernel.org>; linux-
> kernel@vger.kernel.org; Loktionov, Aleksandr
> <aleksandr.loktionov@intel.com>; Mauro Carvalho Chehab
> <mchehab@kernel.org>; Randy Dunlap <rdunlap@infradead.org>
> Subject: [PATCH v2 29/28] docs: kdoc: ensure that comments are dropped
> before calling split_struct_proto()
> 
> Changeset 2b957decdb6c ("docs: kdoc: don't add broken comments inside
> prototypes") revealed a hidden bug at split_struct_proto(): some
> comments there may break its capability of properly identifying a
> struct.
> 
> Fixing it is as simple as stripping comments before calling it.
> 
> Fixes: 2b957decdb6c ("docs: kdoc: don't add broken comments inside
> prototypes")
> Signed-off-by: Mauro Carvalho Chehab <mchehab+huawei@kernel.org>
> ---
>  tools/lib/python/kdoc/kdoc_parser.py | 6 ++++--
>  1 file changed, 4 insertions(+), 2 deletions(-)
> 
> diff --git a/tools/lib/python/kdoc/kdoc_parser.py
> b/tools/lib/python/kdoc/kdoc_parser.py
> index 3ff17b07c1c9..ed378edb1e05 100644
> --- a/tools/lib/python/kdoc/kdoc_parser.py
> +++ b/tools/lib/python/kdoc/kdoc_parser.py
> @@ -724,6 +724,7 @@ class KernelDoc:
>          #
>          # Do the basic parse to get the pieces of the declaration.
>          #
> +        proto = trim_private_members(proto)
>          struct_parts = self.split_struct_proto(proto)
>          if not struct_parts:
>              self.emit_msg(ln, f"{proto} error: Cannot parse struct or union!")
> @@ -764,6 +765,7 @@ class KernelDoc:
>          # Strip preprocessor directives.  Note that this depends on
> the
>          # trailing semicolon we added in process_proto_type().
>          #
> +        proto = trim_private_members(proto)
>          proto = KernRe(r'#\s*((define|ifdef|if)\s+|endif)[^;]*;',
> flags=re.S).sub('', proto)
>          #
>          # Parse out the name and members of the enum.  Typedef form
> first.
> @@ -771,7 +773,7 @@ class KernelDoc:
>          r = KernRe(r'typedef\s+enum\s*\{(.*)\}\s*(\w*)\s*;')
>          if r.search(proto):
>              declaration_name = r.group(2)
> -            members = trim_private_members(r.group(1))
> +            members = r.group(1)
>          #
>          # Failing that, look for a straight enum
>          #
> @@ -779,7 +781,7 @@ class KernelDoc:
>              r = KernRe(r'enum\s+(\w*)\s*\{(.*)\}')
>              if r.match(proto):
>                  declaration_name = r.group(1)
> -                members = trim_private_members(r.group(2))
> +                members = r.group(2)
>          #
>          # OK, this isn't going to work.
>          #
> --
> 2.53.0

Reviewed-by: Aleksandr Loktionov <aleksandr.loktionov@intel.com>

[PATCH v2 30/28] docs: kdoc_parser: avoid tokenizing structs everytime

Posted by Mauro Carvalho Chehab 3 weeks, 4 days ago

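Most of the rules inside CTransforms are of type CMatch.

Tokenize the source once in CTransforms.apply() and hand the tokenized
form to the CMatch rules, instead of re-parsing the source code for
every rule.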

Doing this doesn't change the output, but makes kdoc almost
as fast as before the tokenizer patches:

    # Before tokenizer patches
    $ time ./scripts/kernel-doc . -man >original 2>&1

    real    0m42.933s
    user    0m36.523s
    sys     0m1.145s

    # After tokenizer patches
    $ time ./scripts/kernel-doc . -man >before 2>&1

    real    1m29.853s
    user    1m23.974s
    sys     0m1.237s

    # After this patch
    $ time ./scripts/kernel-doc . -man >after 2>&1

    real    0m48.579s
    user    0m45.938s
    sys     0m0.988s

    $ diff -s before after
    Files before and after are identical

Manually checked the differences between original and after
with:

    $ diff -U0 -prBw original after|grep -v Warning|grep -v "@@"|less

They're due to:
  - whitespace fixes;
  - struct_group are now better handled;
  - several badly-generated man pages from broken inline kernel-doc
    markups are now fixed.

Signed-off-by: Mauro Carvalho Chehab <mchehab+huawei@kernel.org>
---
 tools/lib/python/kdoc/kdoc_parser.py  |  1 -
 tools/lib/python/kdoc/xforms_lists.py | 30 +++++++++++++++++++++------
 2 files changed, 24 insertions(+), 7 deletions(-)

diff --git a/tools/lib/python/kdoc/kdoc_parser.py b/tools/lib/python/kdoc/kdoc_parser.py
index ed378edb1e05..3b99740ebed3 100644
--- a/tools/lib/python/kdoc/kdoc_parser.py
+++ b/tools/lib/python/kdoc/kdoc_parser.py
@@ -738,7 +738,6 @@ class KernelDoc:
         #
         # Go through the list of members applying all of our transformations.
         #
-        members = trim_private_members(members)
         members = self.xforms.apply("struct", members)
 
         #
diff --git a/tools/lib/python/kdoc/xforms_lists.py b/tools/lib/python/kdoc/xforms_lists.py
index c3c532c45cdc..f6ea9efb11ae 100644
--- a/tools/lib/python/kdoc/xforms_lists.py
+++ b/tools/lib/python/kdoc/xforms_lists.py
@@ -5,7 +5,7 @@
 import re
 
 from kdoc.kdoc_re import KernRe
-from kdoc.c_lex import CMatch
+from kdoc.c_lex import CMatch, CTokenizer
 
 struct_args_pattern = r"([^,)]+)"
 
@@ -17,6 +17,12 @@ class CTransforms:
     into something we can parse and generate kdoc for.
     """
 
+    #
+    # NOTE:
+    #      Due to performance reasons, place CMatch rules before KernRe,
+    #      as this avoids running the C parser every time.
+    #
+
     #: Transforms for structs and unions.
     struct_xforms = [
         (CMatch("__attribute__"), ""),
@@ -123,13 +129,25 @@ class CTransforms:
         "var": var_xforms,
     }
 
-    def apply(self, xforms_type, text):
+    def apply(self, xforms_type, source):
         """
-        Apply a set of transforms to a block of text.
+        Apply a set of transforms to a block of source.
+
+        As tokenizer is used here, this function also remove comments
+        at the end.
         """
         if xforms_type not in self.xforms:
-            return text
+            return source
+
+        if isinstance(source, str):
+            source = CTokenizer(source)
 
         for search, subst in self.xforms[xforms_type]:
-            text = search.sub(subst, text)
-        return text
+            #
+            # KernRe only accept strings.
+            #
+            if isinstance(search, KernRe):
+                source = str(source)
+
+            source = search.sub(subst, source)
+        return str(source)
-- 
2.53.0

RE: [PATCH v2 30/28] docs: kdoc_parser: avoid tokenizing structs everytime

Posted by Loktionov, Aleksandr 3 weeks, 4 days ago


> -----Original Message-----
> From: Mauro Carvalho Chehab <mchehab+huawei@kernel.org>
> Sent: Friday, March 13, 2026 9:34 AM
> To: Jonathan Corbet <corbet@lwn.net>; Linux Doc Mailing List <linux-
> doc@vger.kernel.org>
> Cc: Mauro Carvalho Chehab <mchehab+huawei@kernel.org>; linux-
> kernel@vger.kernel.org; Loktionov, Aleksandr
> <aleksandr.loktionov@intel.com>; Mauro Carvalho Chehab
> <mchehab@kernel.org>; Randy Dunlap <rdunlap@infradead.org>
> Subject: [PATCH v2 30/28] docs: kdoc_parser: avoid tokenizing structs
> everytime
> 
> Most of the rules inside CTransforms are of the type CMatch.
> 
> Don't re-parse the source code every time.
> 
> Doing this doesn't change the output, but makes kdoc almost as fast as
> before the tokenizer patches:
> 
>     # Before tokenizer patches
>     $ time ./scripts/kernel-doc . -man >original 2>&1
> 
>     real    0m42.933s
>     user    0m36.523s
>     sys     0m1.145s
> 
>     # After tokenizer patches
>     $ time ./scripts/kernel-doc . -man >before 2>&1
> 
>     real    1m29.853s
>     user    1m23.974s
>     sys     0m1.237s
> 
>     # After this patch
>     $ time ./scripts/kernel-doc . -man >after 2>&1
> 
>     real    0m48.579s
>     user    0m45.938s
>     sys     0m0.988s
> 
>     $ diff -s before after
>     Files before and after are identical
> 
> Manually checked the differences between original and after
> with:
> 
>     $ diff -U0 -prBw original after|grep -v Warning|grep -v "@@"|less
> 
> They're due:
>   - whitespace fixes;
>   - struct_group are now better handled;
>   - several badly-generated man pages from broken inline kernel-doc
>     markups are now fixed.
> 
> Signed-off-by: Mauro Carvalho Chehab <mchehab+huawei@kernel.org>
> ---
>  tools/lib/python/kdoc/kdoc_parser.py  |  1 -
> tools/lib/python/kdoc/xforms_lists.py | 30 +++++++++++++++++++++------
>  2 files changed, 24 insertions(+), 7 deletions(-)
> 
> diff --git a/tools/lib/python/kdoc/kdoc_parser.py
> b/tools/lib/python/kdoc/kdoc_parser.py
> index ed378edb1e05..3b99740ebed3 100644
> --- a/tools/lib/python/kdoc/kdoc_parser.py
> +++ b/tools/lib/python/kdoc/kdoc_parser.py
> @@ -738,7 +738,6 @@ class KernelDoc:
>          #
>          # Go through the list of members applying all of our
> transformations.
>          #
> -        members = trim_private_members(members)
>          members = self.xforms.apply("struct", members)
> 
>          #
> diff --git a/tools/lib/python/kdoc/xforms_lists.py
> b/tools/lib/python/kdoc/xforms_lists.py
> index c3c532c45cdc..f6ea9efb11ae 100644
> --- a/tools/lib/python/kdoc/xforms_lists.py
> +++ b/tools/lib/python/kdoc/xforms_lists.py
> @@ -5,7 +5,7 @@
>  import re
> 
>  from kdoc.kdoc_re import KernRe
> -from kdoc.c_lex import CMatch
> +from kdoc.c_lex import CMatch, CTokenizer
> 
>  struct_args_pattern = r"([^,)]+)"
> 
> @@ -17,6 +17,12 @@ class CTransforms:
>      into something we can parse and generate kdoc for.
>      """
> 
> +    #
> +    # NOTE:
> +    #      Due to performance reasons, place CMatch rules before
> KernRe,
> +    #      as this avoids running the C parser every time.
> +    #
> +
>      #: Transforms for structs and unions.
>      struct_xforms = [
>          (CMatch("__attribute__"), ""),
> @@ -123,13 +129,25 @@ class CTransforms:
>          "var": var_xforms,
>      }
> 
> -    def apply(self, xforms_type, text):
> +    def apply(self, xforms_type, source):
>          """
> -        Apply a set of transforms to a block of text.
> +        Apply a set of transforms to a block of source.
> +
> +        As tokenizer is used here, this function also remove comments
> +        at the end.
>          """
>          if xforms_type not in self.xforms:
> -            return text
> +            return source
> +
> +        if isinstance(source, str):
> +            source = CTokenizer(source)
> 
>          for search, subst in self.xforms[xforms_type]:
> -            text = search.sub(subst, text)
> -        return text
> +            #
> +            # KernRe only accept strings.
> +            #
> +            if isinstance(search, KernRe):
> +                source = str(source)
> +
> +            source = search.sub(subst, source)
> +        return str(source)
> --
> 2.53.0

Reviewed-by: Aleksandr Loktionov <aleksandr.loktionov@intel.com>