dump_struct is one of the longest functions in the kdoc_parser class,
making it hard to read and reason about. Move the definition of the prefix
transformations out of the function, join them with the definition of
"attribute" (which was defined at the top of the file but only used here),
and reformat the code slightly for shorter line widths.
Just code movement in the end.
Signed-off-by: Jonathan Corbet <corbet@lwn.net>
---
scripts/lib/kdoc/kdoc_parser.py | 178 +++++++++++++++++---------------
1 file changed, 96 insertions(+), 82 deletions(-)
diff --git a/scripts/lib/kdoc/kdoc_parser.py b/scripts/lib/kdoc/kdoc_parser.py
index e1efa65a3480..5e375318df9c 100644
--- a/scripts/lib/kdoc/kdoc_parser.py
+++ b/scripts/lib/kdoc/kdoc_parser.py
@@ -54,8 +54,6 @@ doc_inline_start = KernRe(r'^\s*/\*\*\s*$', cache=False)
doc_inline_sect = KernRe(r'\s*\*\s*(@\s*[\w][\w\.]*\s*):(.*)', cache=False)
doc_inline_end = KernRe(r'^\s*\*/\s*$', cache=False)
doc_inline_oneline = KernRe(r'^\s*/\*\*\s*(@[\w\s]+):\s*(.*)\s*\*/\s*$', cache=False)
-attribute = KernRe(r"__attribute__\s*\(\([a-z0-9,_\*\s\(\)]*\)\)",
- flags=re.I | re.S, cache=False)
export_symbol = KernRe(r'^\s*EXPORT_SYMBOL(_GPL)?\s*\(\s*(\w+)\s*\)\s*', cache=False)
export_symbol_ns = KernRe(r'^\s*EXPORT_SYMBOL_NS(_GPL)?\s*\(\s*(\w+)\s*,\s*"\S+"\)\s*', cache=False)
@@ -74,6 +72,97 @@ doc_begin_func = KernRe(str(doc_com) + # initial " * '
r'(?:[-:].*)?$', # description (not captured)
cache = False)
+#
+# Here begins a long set of transformations to turn structure member prefixes
+# and macro invocations into something we can parse and generate kdoc for.
+#
+struct_attribute = KernRe(r"__attribute__\s*\(\([a-z0-9,_\*\s\(\)]*\)\)",
+ flags=re.I | re.S, cache=False)
+struct_args_pattern = r'([^,)]+)'
+
+struct_prefixes = [
+ # Strip attributes
+ (struct_attribute, ' '),
+ (KernRe(r'\s*__aligned\s*\([^;]*\)', re.S), ' '),
+ (KernRe(r'\s*__counted_by\s*\([^;]*\)', re.S), ' '),
+ (KernRe(r'\s*__counted_by_(le|be)\s*\([^;]*\)', re.S), ' '),
+ (KernRe(r'\s*__packed\s*', re.S), ' '),
+ (KernRe(r'\s*CRYPTO_MINALIGN_ATTR', re.S), ' '),
+ (KernRe(r'\s*____cacheline_aligned_in_smp', re.S), ' '),
+ (KernRe(r'\s*____cacheline_aligned', re.S), ' '),
+ #
+ # Unwrap struct_group macros based on this definition:
+ # __struct_group(TAG, NAME, ATTRS, MEMBERS...)
+ # which has variants like: struct_group(NAME, MEMBERS...)
+ # Only MEMBERS arguments require documentation.
+ #
+ # Parsing them happens on two steps:
+ #
+ # 1. drop struct group arguments that aren't at MEMBERS,
+ # storing them as STRUCT_GROUP(MEMBERS)
+ #
+ # 2. remove STRUCT_GROUP() ancillary macro.
+ #
+ # The original logic used to remove STRUCT_GROUP() using an
+ # advanced regex:
+ #
+ # \bSTRUCT_GROUP(\(((?:(?>[^)(]+)|(?1))*)\))[^;]*;
+ #
+ # with two patterns that are incompatible with
+ # Python re module, as it has:
+ #
+ # - a recursive pattern: (?1)
+ # - an atomic grouping: (?>...)
+ #
+ # I tried a simpler version: but it didn't work either:
+ # \bSTRUCT_GROUP\(([^\)]+)\)[^;]*;
+ #
+ # As it doesn't properly match the end parenthesis on some cases.
+ #
+ # So, a better solution was crafted: there's now a NestedMatch
+ # class that ensures that delimiters after a search are properly
+ # matched. So, the implementation to drop STRUCT_GROUP() will be
+ # handled in separate.
+ #
+ (KernRe(r'\bstruct_group\s*\(([^,]*,)', re.S), r'STRUCT_GROUP('),
+ (KernRe(r'\bstruct_group_attr\s*\(([^,]*,){2}', re.S), r'STRUCT_GROUP('),
+ (KernRe(r'\bstruct_group_tagged\s*\(([^,]*),([^,]*),', re.S), r'struct \1 \2; STRUCT_GROUP('),
+ (KernRe(r'\b__struct_group\s*\(([^,]*,){3}', re.S), r'STRUCT_GROUP('),
+ #
+ # Replace macros
+ #
+ # TODO: use NestedMatch for FOO($1, $2, ...) matches
+ #
+ # it is better to also move those to the NestedMatch logic,
+ # to ensure that parenthesis will be properly matched.
+ #
+ (KernRe(r'__ETHTOOL_DECLARE_LINK_MODE_MASK\s*\(([^)]+)\)', re.S),
+ r'DECLARE_BITMAP(\1, __ETHTOOL_LINK_MODE_MASK_NBITS)'),
+ (KernRe(r'DECLARE_PHY_INTERFACE_MASK\s*\(([^)]+)\)', re.S),
+ r'DECLARE_BITMAP(\1, PHY_INTERFACE_MODE_MAX)'),
+ (KernRe(r'DECLARE_BITMAP\s*\(' + struct_args_pattern + r',\s*' + struct_args_pattern + r'\)',
+ re.S), r'unsigned long \1[BITS_TO_LONGS(\2)]'),
+ (KernRe(r'DECLARE_HASHTABLE\s*\(' + struct_args_pattern + r',\s*' + struct_args_pattern + r'\)',
+ re.S), r'unsigned long \1[1 << ((\2) - 1)]'),
+ (KernRe(r'DECLARE_KFIFO\s*\(' + struct_args_pattern + r',\s*' + struct_args_pattern +
+ r',\s*' + struct_args_pattern + r'\)', re.S), r'\2 *\1'),
+ (KernRe(r'DECLARE_KFIFO_PTR\s*\(' + struct_args_pattern + r',\s*' +
+ struct_args_pattern + r'\)', re.S), r'\2 *\1'),
+ (KernRe(r'(?:__)?DECLARE_FLEX_ARRAY\s*\(' + struct_args_pattern + r',\s*' +
+ struct_args_pattern + r'\)', re.S), r'\1 \2[]'),
+ (KernRe(r'DEFINE_DMA_UNMAP_ADDR\s*\(' + struct_args_pattern + r'\)', re.S), r'dma_addr_t \1'),
+ (KernRe(r'DEFINE_DMA_UNMAP_LEN\s*\(' + struct_args_pattern + r'\)', re.S), r'__u32 \1'),
+]
+#
+# Regexes here are guaranteed to have the end limiter matching
+# the start delimiter. Yet, right now, only one replace group
+# is allowed.
+#
+struct_nested_prefixes = [
+ (re.compile(r'\bSTRUCT_GROUP\('), r'\1'),
+]
+
+
#
# A little helper to get rid of excess white space
#
@@ -579,90 +668,15 @@ class KernelDoc:
f"expecting prototype for {decl_type} {self.entry.identifier}. Prototype was for {decl_type} {declaration_name} instead\n")
return
- args_pattern = r'([^,)]+)'
-
- sub_prefixes = [
- # Strip attributes
- (attribute, ' '),
- (KernRe(r'\s*__aligned\s*\([^;]*\)', re.S), ' '),
- (KernRe(r'\s*__counted_by\s*\([^;]*\)', re.S), ' '),
- (KernRe(r'\s*__counted_by_(le|be)\s*\([^;]*\)', re.S), ' '),
- (KernRe(r'\s*__packed\s*', re.S), ' '),
- (KernRe(r'\s*CRYPTO_MINALIGN_ATTR', re.S), ' '),
- (KernRe(r'\s*____cacheline_aligned_in_smp', re.S), ' '),
- (KernRe(r'\s*____cacheline_aligned', re.S), ' '),
-
- # Unwrap struct_group macros based on this definition:
- # __struct_group(TAG, NAME, ATTRS, MEMBERS...)
- # which has variants like: struct_group(NAME, MEMBERS...)
- # Only MEMBERS arguments require documentation.
- #
- # Parsing them happens on two steps:
- #
- # 1. drop struct group arguments that aren't at MEMBERS,
- # storing them as STRUCT_GROUP(MEMBERS)
- #
- # 2. remove STRUCT_GROUP() ancillary macro.
- #
- # The original logic used to remove STRUCT_GROUP() using an
- # advanced regex:
- #
- # \bSTRUCT_GROUP(\(((?:(?>[^)(]+)|(?1))*)\))[^;]*;
- #
- # with two patterns that are incompatible with
- # Python re module, as it has:
- #
- # - a recursive pattern: (?1)
- # - an atomic grouping: (?>...)
- #
- # I tried a simpler version: but it didn't work either:
- # \bSTRUCT_GROUP\(([^\)]+)\)[^;]*;
- #
- # As it doesn't properly match the end parenthesis on some cases.
- #
- # So, a better solution was crafted: there's now a NestedMatch
- # class that ensures that delimiters after a search are properly
- # matched. So, the implementation to drop STRUCT_GROUP() will be
- # handled in separate.
-
- (KernRe(r'\bstruct_group\s*\(([^,]*,)', re.S), r'STRUCT_GROUP('),
- (KernRe(r'\bstruct_group_attr\s*\(([^,]*,){2}', re.S), r'STRUCT_GROUP('),
- (KernRe(r'\bstruct_group_tagged\s*\(([^,]*),([^,]*),', re.S), r'struct \1 \2; STRUCT_GROUP('),
- (KernRe(r'\b__struct_group\s*\(([^,]*,){3}', re.S), r'STRUCT_GROUP('),
-
- # Replace macros
- #
- # TODO: use NestedMatch for FOO($1, $2, ...) matches
- #
- # it is better to also move those to the NestedMatch logic,
- # to ensure that parenthesis will be properly matched.
-
- (KernRe(r'__ETHTOOL_DECLARE_LINK_MODE_MASK\s*\(([^)]+)\)', re.S), r'DECLARE_BITMAP(\1, __ETHTOOL_LINK_MODE_MASK_NBITS)'),
- (KernRe(r'DECLARE_PHY_INTERFACE_MASK\s*\(([^)]+)\)', re.S), r'DECLARE_BITMAP(\1, PHY_INTERFACE_MODE_MAX)'),
- (KernRe(r'DECLARE_BITMAP\s*\(' + args_pattern + r',\s*' + args_pattern + r'\)', re.S), r'unsigned long \1[BITS_TO_LONGS(\2)]'),
- (KernRe(r'DECLARE_HASHTABLE\s*\(' + args_pattern + r',\s*' + args_pattern + r'\)', re.S), r'unsigned long \1[1 << ((\2) - 1)]'),
- (KernRe(r'DECLARE_KFIFO\s*\(' + args_pattern + r',\s*' + args_pattern + r',\s*' + args_pattern + r'\)', re.S), r'\2 *\1'),
- (KernRe(r'DECLARE_KFIFO_PTR\s*\(' + args_pattern + r',\s*' + args_pattern + r'\)', re.S), r'\2 *\1'),
- (KernRe(r'(?:__)?DECLARE_FLEX_ARRAY\s*\(' + args_pattern + r',\s*' + args_pattern + r'\)', re.S), r'\1 \2[]'),
- (KernRe(r'DEFINE_DMA_UNMAP_ADDR\s*\(' + args_pattern + r'\)', re.S), r'dma_addr_t \1'),
- (KernRe(r'DEFINE_DMA_UNMAP_LEN\s*\(' + args_pattern + r'\)', re.S), r'__u32 \1'),
- ]
-
- # Regexes here are guaranteed to have the end limiter matching
- # the start delimiter. Yet, right now, only one replace group
- # is allowed.
-
- sub_nested_prefixes = [
- (re.compile(r'\bSTRUCT_GROUP\('), r'\1'),
- ]
-
+ #
+ # Go through the list of members applying all of our transformations.
+ #
members = trim_private_members(members)
- for search, sub in sub_prefixes:
+ for search, sub in struct_prefixes:
members = search.sub(sub, members)
nested = NestedMatch()
-
- for search, sub in sub_nested_prefixes:
+ for search, sub in struct_nested_prefixes:
members = nested.sub(search, sub, members)
# Keeps the original declaration as-is
--
2.50.1
Em Thu, 31 Jul 2025 18:13:18 -0600 Jonathan Corbet <corbet@lwn.net> escreveu: > dump_struct is one of the longest functions in the kdoc_parser class, > making it hard to read and reason about. Move the definition of the prefix > transformations out of the function, join them with the definition of > "attribute" (which was defined at the top of the file but only used here), > and reformat the code slightly for shorter line widths. > > Just code movement in the end. This patch itself LGTM: Reviewed-by: Mauro Carvalho Chehab <mchehab+huawei@kernel.org> but see my notes below: > +struct_prefixes = [ > + # Strip attributes > + (struct_attribute, ' '), > + (KernRe(r'\s*__aligned\s*\([^;]*\)', re.S), ' '), > + (KernRe(r'\s*__counted_by\s*\([^;]*\)', re.S), ' '), > + (KernRe(r'\s*__counted_by_(le|be)\s*\([^;]*\)', re.S), ' '), > + (KernRe(r'\s*__packed\s*', re.S), ' '), > + (KernRe(r'\s*CRYPTO_MINALIGN_ATTR', re.S), ' '), > + (KernRe(r'\s*____cacheline_aligned_in_smp', re.S), ' '), > + (KernRe(r'\s*____cacheline_aligned', re.S), ' '), > + # > + # Unwrap struct_group macros based on this definition: > + # __struct_group(TAG, NAME, ATTRS, MEMBERS...) > + # which has variants like: struct_group(NAME, MEMBERS...) > + # Only MEMBERS arguments require documentation. > + # > + # Parsing them happens on two steps: > + # > + # 1. drop struct group arguments that aren't at MEMBERS, > + # storing them as STRUCT_GROUP(MEMBERS) > + # > + # 2. remove STRUCT_GROUP() ancillary macro. > + # > + # The original logic used to remove STRUCT_GROUP() using an > + # advanced regex: > + # > + # \bSTRUCT_GROUP(\(((?:(?>[^)(]+)|(?1))*)\))[^;]*; > + # > + # with two patterns that are incompatible with > + # Python re module, as it has: > + # > + # - a recursive pattern: (?1) > + # - an atomic grouping: (?>...) > + # > + # I tried a simpler version: but it didn't work either: > + # \bSTRUCT_GROUP\(([^\)]+)\)[^;]*; > + # > + # As it doesn't properly match the end parenthesis on some cases. > + # > + # So, a better solution was crafted: there's now a NestedMatch > + # class that ensures that delimiters after a search are properly > + # matched. So, the implementation to drop STRUCT_GROUP() will be > + # handled in separate. > + # > + (KernRe(r'\bstruct_group\s*\(([^,]*,)', re.S), r'STRUCT_GROUP('), > + (KernRe(r'\bstruct_group_attr\s*\(([^,]*,){2}', re.S), r'STRUCT_GROUP('), > + (KernRe(r'\bstruct_group_tagged\s*\(([^,]*),([^,]*),', re.S), r'struct \1 \2; STRUCT_GROUP('), > + (KernRe(r'\b__struct_group\s*\(([^,]*,){3}', re.S), r'STRUCT_GROUP('), > + # > + # Replace macros > + # > + # TODO: use NestedMatch for FOO($1, $2, ...) matches This comment is actually related to patch 03/12: regex cleanups: If you want to simplify a lot the regular expressions here, the best is to take a look at the NestedMatch class and improve it. There are lots of regular expressions here that are very complex because they try to ensure that something like these: 1. function(<arg1>) 2. function(<arg1>, <arg2>,<arg3>,...) are properly parsed[1], but if we turn it into something that handle (2) as well, we could use it like: match = NestedMatch.search("function", string) # or, alternatively: # match = NestedMatch.search("function($1, $2, $3)", string) if match: arg1 = match.group(1) arg2 = match.group(2) arg3 = match.group(3) or even do more complex changes like: NestedMatch.sub("foo($1, $2)", "new_name($2)", string) A class implementing that will help to transform all sorts of functions and simplify the more complex regexes on kernel-doc. Doing that will very likely simplify a lot the struct_prefixes, replacing it by something a lot more easier to understand: # Nice and simpler set of replacement rules struct_nested_matches = [ ("__aligned", ""), ("__counted_by", ""), ("__counted_by_(be|le)", ""), ... # Picked those from stddef.h macro replacement rules ("struct_group(NAME, MEMBERS...)", "__struct_group(, NAME, , MEMBERS)"), ("struct_group(TAG, NAME, ATTRS, MEMBERS...)", """ __struct_group(TAG, NAME, ATTRS, MEMBERS...) union { struct { MEMBERS } ATTRS; struct __struct_group_tag(TAG) { MEMBERS } ATTRS NAME; } ATTRS"""), ... ] members = trim_private_members(members) for from, to in struct_nested_matches: members = NestedMatch.sub(from, to, members) Granted, wiring this up takes some time and lots of testing - we should likely have some unit tests to catch issues there - but IMO it is worth the effort. - [1] NestedMatch() is currently limited to match function(<args>), as it was written to replace really complex regular expressions with recursive patterns and atomic grouping, that were used only to capture macro calls for: STRUCT_GROUP(...) I might have used instead "import regex", but I didn't want to add the extra dependency of a non-standard Python library at the Kernel build. Thanks, Mauro
Em Fri, 1 Aug 2025 07:28:41 +0200 Mauro Carvalho Chehab <mchehab+huawei@kernel.org> escreveu: > Em Thu, 31 Jul 2025 18:13:18 -0600 > Jonathan Corbet <corbet@lwn.net> escreveu: > > > dump_struct is one of the longest functions in the kdoc_parser class, > > making it hard to read and reason about. Move the definition of the prefix > > transformations out of the function, join them with the definition of > > "attribute" (which was defined at the top of the file but only used here), > > and reformat the code slightly for shorter line widths. > > > > Just code movement in the end. > > This patch itself LGTM: > > Reviewed-by: Mauro Carvalho Chehab <mchehab+huawei@kernel.org> In time, my R-B from patch 4 and above assumes that patch 3 is dropped, as I'm not re-checking the regular expressions. > > but see my notes below: > > > +struct_prefixes = [ > > + # Strip attributes > > + (struct_attribute, ' '), > > + (KernRe(r'\s*__aligned\s*\([^;]*\)', re.S), ' '), > > + (KernRe(r'\s*__counted_by\s*\([^;]*\)', re.S), ' '), > > + (KernRe(r'\s*__counted_by_(le|be)\s*\([^;]*\)', re.S), ' '), > > + (KernRe(r'\s*__packed\s*', re.S), ' '), > > + (KernRe(r'\s*CRYPTO_MINALIGN_ATTR', re.S), ' '), > > + (KernRe(r'\s*____cacheline_aligned_in_smp', re.S), ' '), > > + (KernRe(r'\s*____cacheline_aligned', re.S), ' '), > > + # > > + # Unwrap struct_group macros based on this definition: > > + # __struct_group(TAG, NAME, ATTRS, MEMBERS...) > > + # which has variants like: struct_group(NAME, MEMBERS...) > > + # Only MEMBERS arguments require documentation. > > + # > > + # Parsing them happens on two steps: > > + # > > + # 1. drop struct group arguments that aren't at MEMBERS, > > + # storing them as STRUCT_GROUP(MEMBERS) > > + # > > + # 2. remove STRUCT_GROUP() ancillary macro. > > + # > > + # The original logic used to remove STRUCT_GROUP() using an > > + # advanced regex: > > + # > > + # \bSTRUCT_GROUP(\(((?:(?>[^)(]+)|(?1))*)\))[^;]*; > > + # > > + # with two patterns that are incompatible with > > + # Python re module, as it has: > > + # > > + # - a recursive pattern: (?1) > > + # - an atomic grouping: (?>...) > > + # > > + # I tried a simpler version: but it didn't work either: > > + # \bSTRUCT_GROUP\(([^\)]+)\)[^;]*; > > + # > > + # As it doesn't properly match the end parenthesis on some cases. > > + # > > + # So, a better solution was crafted: there's now a NestedMatch > > + # class that ensures that delimiters after a search are properly > > + # matched. So, the implementation to drop STRUCT_GROUP() will be > > + # handled in separate. > > + # > > + (KernRe(r'\bstruct_group\s*\(([^,]*,)', re.S), r'STRUCT_GROUP('), > > + (KernRe(r'\bstruct_group_attr\s*\(([^,]*,){2}', re.S), r'STRUCT_GROUP('), > > + (KernRe(r'\bstruct_group_tagged\s*\(([^,]*),([^,]*),', re.S), r'struct \1 \2; STRUCT_GROUP('), > > + (KernRe(r'\b__struct_group\s*\(([^,]*,){3}', re.S), r'STRUCT_GROUP('), > > + # > > + # Replace macros > > + # > > + # TODO: use NestedMatch for FOO($1, $2, ...) matches > > This comment is actually related to patch 03/12: regex cleanups: > > If you want to simplify a lot the regular expressions here, the best > is to take a look at the NestedMatch class and improve it. There are lots > of regular expressions here that are very complex because they try > to ensure that something like these: > > 1. function(<arg1>) > 2. function(<arg1>, <arg2>,<arg3>,...) > > are properly parsed[1], but if we turn it into something that handle (2) as > well, we could use it like: > > match = NestedMatch.search("function", string) > # or, alternatively: > # match = NestedMatch.search("function($1, $2, $3)", string) > > if match: > arg1 = match.group(1) > arg2 = match.group(2) > arg3 = match.group(3) > > or even do more complex changes like: > > NestedMatch.sub("foo($1, $2)", "new_name($2)", string) > > A class implementing that will help to transform all sorts of functions > and simplify the more complex regexes on kernel-doc. Doing that will > very likely simplify a lot the struct_prefixes, replacing it by something > a lot more easier to understand: > > # Nice and simpler set of replacement rules > struct_nested_matches = [ > ("__aligned", ""), > ("__counted_by", ""), > ("__counted_by_(be|le)", ""), > ... > # Picked those from stddef.h macro replacement rules > ("struct_group(NAME, MEMBERS...)", "__struct_group(, NAME, , MEMBERS)"), > ("struct_group(TAG, NAME, ATTRS, MEMBERS...)", > """ __struct_group(TAG, NAME, ATTRS, MEMBERS...) > union { > struct { MEMBERS } ATTRS; > struct __struct_group_tag(TAG) { MEMBERS } ATTRS NAME; > } ATTRS"""), > ... > ] > > members = trim_private_members(members) > for from, to in struct_nested_matches: > members = NestedMatch.sub(from, to, members) > > Granted, wiring this up takes some time and lots of testing - we should > likely have some unit tests to catch issues there - but IMO it is > worth the effort. > > - > > [1] NestedMatch() is currently limited to match function(<args>), as it was > written to replace really complex regular expressions with > recursive patterns and atomic grouping, that were used only to > capture macro calls for: > > STRUCT_GROUP(...) > > I might have used instead "import regex", but I didn't want to add the > extra dependency of a non-standard Python library at the Kernel build. > > Thanks, > Mauro Thanks, Mauro
© 2016 - 2025 Red Hat, Inc.