Get rid of some single-use variables and redundant checks, and generally
tighten up the code; no logical change.
Signed-off-by: Jonathan Corbet <corbet@lwn.net>
---
scripts/lib/kdoc/kdoc_parser.py | 89 ++++++++++++++++-----------------
1 file changed, 42 insertions(+), 47 deletions(-)
diff --git a/scripts/lib/kdoc/kdoc_parser.py b/scripts/lib/kdoc/kdoc_parser.py
index 20e0a2abe13b..2b7d7e646367 100644
--- a/scripts/lib/kdoc/kdoc_parser.py
+++ b/scripts/lib/kdoc/kdoc_parser.py
@@ -673,73 +673,68 @@ class KernelDoc:
while tuples:
for t in tuples:
newmember = ""
- maintype = t[0]
- s_ids = t[5]
- content = t[3]
-
- oldmember = "".join(t)
-
- for s_id in s_ids.split(','):
+ oldmember = "".join(t) # Reconstruct the original formatting
+ #
+ # Pass through each field name, normalizing the form and formatting.
+ #
+ for s_id in t[5].split(','):
s_id = s_id.strip()
- newmember += f"{maintype} {s_id}; "
+ newmember += f"{t[0]} {s_id}; "
+ #
+ # Remove bitfield/array/pointer info, getting the bare name.
+ #
s_id = KernRe(r'[:[].*').sub('', s_id)
s_id = KernRe(r'^\s*\**(\S+)\s*').sub(r'\1', s_id)
-
- for arg in content.split(';'):
+ #
+ # Pass through the members of this inner structure/union.
+ #
+ for arg in t[3].split(';'):
arg = arg.strip()
-
- if not arg:
- continue
-
+ #
+ # Look for (type)(*name)(args) - pointer to function
+ #
r = KernRe(r'^([^(]+\(\*?\s*)([\w.]*)(\s*\).*)')
if r.match(arg):
# Pointer-to-function
- dtype = r.group(1)
- name = r.group(2)
- extra = r.group(3)
-
- if not name:
- continue
-
if not s_id:
# Anonymous struct/union
- newmember += f"{dtype}{name}{extra}; "
+ newmember += f"{r.group(1)}{r.group(2)}{r.group(3)}; "
else:
- newmember += f"{dtype}{s_id}.{name}{extra}; "
-
+ newmember += f"{r.group(1)}{s_id}.{r.group(2)}{r.group(3)}; "
+ #
+ # Otherwise a non-function member.
+ #
else:
- # Handle bitmaps
+ #
+ # Remove bitmap and array portions and spaces around commas
+ #
arg = KernRe(r':\s*\d+\s*').sub('', arg)
-
- # Handle arrays
arg = KernRe(r'\[.*\]').sub('', arg)
-
- # Handle multiple IDs
arg = KernRe(r'\s*,\s*').sub(',', arg)
-
+ #
+ # Look for a normal decl - "type name[,name...]"
+ #
r = KernRe(r'(.*)\s+([\S+,]+)')
-
if r.search(arg):
- dtype = r.group(1)
- names = r.group(2)
+ for name in r.group(2).split(','):
+ name = KernRe(r'^\s*\**(\S+)\s*').sub(r'\1', name)
+ if not s_id:
+ # Anonymous struct/union
+ newmember += f"{r.group(1)} {name}; "
+ else:
+ newmember += f"{r.group(1)} {s_id}.{name}; "
else:
newmember += f"{arg}; "
- continue
-
- for name in names.split(','):
- name = KernRe(r'^\s*\**(\S+)\s*').sub(r'\1', name).strip()
-
- if not name:
- continue
-
- if not s_id:
- # Anonymous struct/union
- newmember += f"{dtype} {name}; "
- else:
- newmember += f"{dtype} {s_id}.{name}; "
-
+ #
+ # At the end of the s_id loop, replace the original declaration with
+ # the munged version.
+ #
members = members.replace(oldmember, newmember)
+ #
+ # End of the tuple loop - search again and see if there are outer members
+ # that now turn up.
+ #
tuples = struct_members.findall(members)
return members
--
2.50.1
Em Thu, 31 Jul 2025 18:13:24 -0600 Jonathan Corbet <corbet@lwn.net> escreveu: > Get rid of some single-use variables and redundant checks, and generally > tighten up the code; no logical change. > > Signed-off-by: Jonathan Corbet <corbet@lwn.net> > --- > scripts/lib/kdoc/kdoc_parser.py | 89 ++++++++++++++++----------------- > 1 file changed, 42 insertions(+), 47 deletions(-) > > diff --git a/scripts/lib/kdoc/kdoc_parser.py b/scripts/lib/kdoc/kdoc_parser.py > index 20e0a2abe13b..2b7d7e646367 100644 > --- a/scripts/lib/kdoc/kdoc_parser.py > +++ b/scripts/lib/kdoc/kdoc_parser.py > @@ -673,73 +673,68 @@ class KernelDoc: > while tuples: > for t in tuples: > newmember = "" > - maintype = t[0] > - s_ids = t[5] > - content = t[3] The reason I opted for this particular approach... > - > - oldmember = "".join(t) > - > - for s_id in s_ids.split(','): > + oldmember = "".join(t) # Reconstruct the original formatting > + # > + # Pass through each field name, normalizing the form and formatting. > + # > + for s_id in t[5].split(','): ... is that it is easier to understand and to maintain: for s_id in s_ids.split(','): than when magic numbers like this are used: for s_id in t[5].split(','): > s_id = s_id.strip() > > - newmember += f"{maintype} {s_id}; " > + newmember += f"{t[0]} {s_id}; " > + # > + # Remove bitfield/array/pointer info, getting the bare name. > + # > s_id = KernRe(r'[:[].*').sub('', s_id) > s_id = KernRe(r'^\s*\**(\S+)\s*').sub(r'\1', s_id) > - > - for arg in content.split(';'): > + # > + # Pass through the members of this inner structure/union. > + # > + for arg in t[3].split(';'): Here, for example, we're far away from the tuple definition... I can't recall anymore what "3" magic number means ;-) > arg = arg.strip() > - > - if not arg: > - continue > - > + # > + # Look for (type)(*name)(args) - pointer to function > + # > r = KernRe(r'^([^(]+\(\*?\s*)([\w.]*)(\s*\).*)') > if r.match(arg): > # Pointer-to-function > - dtype = r.group(1) > - name = r.group(2) > - extra = r.group(3) Same applies here. Having a named var makes easier to understand/maintain rest of the code. If you're willing to do something like that, better to use named capture groups, like: r = KernRe(r'^(?P<dtype>[^(]+\(\*?\s*)' r'(?P<name>[\w.]*)' r'(?P<extra>\s*\).*)') together with a syntax using match.group(group_name) I'm not a particular fan of named groups, as it adds a lot more stuff at regexes. They're already hard enough to understand without ?P<name>, but at least match.group('dtype'), match.group('name'), match.group('extra') inside the next calls would be easier to maintain than when using magic numbers. Same comments apply to other changes below. > - > - if not name: > - continue > - > if not s_id: > # Anonymous struct/union > - newmember += f"{dtype}{name}{extra}; " > + newmember += f"{r.group(1)}{r.group(2)}{r.group(3)}; " > else: > - newmember += f"{dtype}{s_id}.{name}{extra}; " > - > + newmember += f"{r.group(1)}{s_id}.{r.group(2)}{r.group(3)}; " > + # > + # Otherwise a non-function member. > + # > else: > - # Handle bitmaps > + # > + # Remove bitmap and array portions and spaces around commas > + # > arg = KernRe(r':\s*\d+\s*').sub('', arg) > - > - # Handle arrays > arg = KernRe(r'\[.*\]').sub('', arg) > - > - # Handle multiple IDs > arg = KernRe(r'\s*,\s*').sub(',', arg) > - > + # > + # Look for a normal decl - "type name[,name...]" > + # > r = KernRe(r'(.*)\s+([\S+,]+)') > - > if r.search(arg): > - dtype = r.group(1) > - names = r.group(2) > + for name in r.group(2).split(','): > + name = KernRe(r'^\s*\**(\S+)\s*').sub(r'\1', name) > + if not s_id: > + # Anonymous struct/union > + newmember += f"{r.group(1)} {name}; " > + else: > + newmember += f"{r.group(1)} {s_id}.{name}; " > else: > newmember += f"{arg}; " > - continue > - > - for name in names.split(','): > - name = KernRe(r'^\s*\**(\S+)\s*').sub(r'\1', name).strip() > - > - if not name: > - continue > - > - if not s_id: > - # Anonymous struct/union > - newmember += f"{dtype} {name}; " > - else: > - newmember += f"{dtype} {s_id}.{name}; " > - > + # > + # At the end of the s_id loop, replace the original declaration with > + # the munged version. > + # > members = members.replace(oldmember, newmember) > + # > + # End of the tuple loop - search again and see if there are outer members > + # that now turn up. > + # > tuples = struct_members.findall(members) > return members > Thanks, Mauro
Mauro Carvalho Chehab <mchehab+huawei@kernel.org> writes: > Em Thu, 31 Jul 2025 18:13:24 -0600 > Jonathan Corbet <corbet@lwn.net> escreveu: > >> Get rid of some single-use variables and redundant checks, and generally >> tighten up the code; no logical change. >> >> Signed-off-by: Jonathan Corbet <corbet@lwn.net> >> --- >> scripts/lib/kdoc/kdoc_parser.py | 89 ++++++++++++++++----------------- >> 1 file changed, 42 insertions(+), 47 deletions(-) >> >> diff --git a/scripts/lib/kdoc/kdoc_parser.py b/scripts/lib/kdoc/kdoc_parser.py >> index 20e0a2abe13b..2b7d7e646367 100644 >> --- a/scripts/lib/kdoc/kdoc_parser.py >> +++ b/scripts/lib/kdoc/kdoc_parser.py >> @@ -673,73 +673,68 @@ class KernelDoc: >> while tuples: >> for t in tuples: >> newmember = "" >> - maintype = t[0] >> - s_ids = t[5] >> - content = t[3] > > The reason I opted for this particular approach... >> - >> - oldmember = "".join(t) >> - >> - for s_id in s_ids.split(','): >> + oldmember = "".join(t) # Reconstruct the original formatting >> + # >> + # Pass through each field name, normalizing the form and formatting. >> + # >> + for s_id in t[5].split(','): > > ... is that it is easier to understand and to maintain: > > for s_id in s_ids.split(','): > > than when magic numbers like this are used: > > for s_id in t[5].split(','): Coming into this code, I had a different experience, and found the variables to just be a layer of indirection I had to pass through to get to the capture groups and see what was really going on. That was part of why I put the group numbers in the comments next to that gnarly regex, to make that mapping more direct and easier to understand. I will not insist on this change either - at least not indefinitely :) I do feel, though, that adding a step between the regex and its use just serves to obscure things. (And yes, I don't really think that named groups make things better. I've found those useful in situations where multiple regexes are in use and the ordering of the groups may vary, but otherwise have generally avoided them). Thanks, jon
Em Fri, 01 Aug 2025 16:52:33 -0600 Jonathan Corbet <corbet@lwn.net> escreveu: > Mauro Carvalho Chehab <mchehab+huawei@kernel.org> writes: > > > Em Thu, 31 Jul 2025 18:13:24 -0600 > > Jonathan Corbet <corbet@lwn.net> escreveu: > > > >> Get rid of some single-use variables and redundant checks, and generally > >> tighten up the code; no logical change. > >> > >> Signed-off-by: Jonathan Corbet <corbet@lwn.net> > >> --- > >> scripts/lib/kdoc/kdoc_parser.py | 89 ++++++++++++++++----------------- > >> 1 file changed, 42 insertions(+), 47 deletions(-) > >> > >> diff --git a/scripts/lib/kdoc/kdoc_parser.py b/scripts/lib/kdoc/kdoc_parser.py > >> index 20e0a2abe13b..2b7d7e646367 100644 > >> --- a/scripts/lib/kdoc/kdoc_parser.py > >> +++ b/scripts/lib/kdoc/kdoc_parser.py > >> @@ -673,73 +673,68 @@ class KernelDoc: > >> while tuples: > >> for t in tuples: > >> newmember = "" > >> - maintype = t[0] > >> - _ids = t[5]s > >> - content = t[3] > > > > The reason I opted for this particular approach... > >> - > >> - oldmember = "".join(t) > >> - > >> - for s_id in s_ids.split(','): > >> + oldmember = "".join(t) # Reconstruct the original formatting > >> + # > >> + # Pass through each field name, normalizing the form and formatting. > >> + # > >> + for s_id in t[5].split(','): > > > > ... is that it is easier to understand and to maintain: > > > > for s_id in s_ids.split(','): > > > > than when magic numbers like this are used: > > > > for s_id in t[5].split(','): > > Coming into this code, I had a different experience, and found the > variables to just be a layer of indirection I had to pass through to get > to the capture groups and see what was really going on. That was part > of why I put the group numbers in the comments next to that gnarly > regex, to make that mapping more direct and easier to understand. > > I will not insist on this change either - at least not indefinitely :) > I do feel, though, that adding a step between the regex and its use just > serves to obscure things. > > (And yes, I don't really think that named groups make things better. > I've found those useful in situations where multiple regexes are in use > and the ordering of the groups may vary, but otherwise have generally > avoided them). I'd say that, when the magic number is within up to 3-lines hunk distance - e.g. if all of them will appear at the same hunk, it is probably safe to use, but when it gets far away, it makes more harm than good. Perhaps one alternative would do something like: tuples = struct_members.findall(members) if not tuples: break maintype, -, -, content, -, s_ids = tuples (assuming that we don't need t[1], t[2] and t[4] here) Btw, on this specific case, better to use non-capture group matches to avoid those "empty" spaces, e.g. (if I got it right): # Curly Brackets are not captured struct_members = KernRe(type_pattern + # Capture main type r'([^\{\};]+)' + r'(?:\{)' + r'(?:[^\{\}]*)' + # Capture content r'(?:\})' + r'([^\{\}\;]*)(\;)') # Capture IDs ... tuples = struct_members.findall(members) if not tuples: break maintype, content, s_ids = tuples Btw, a cleanup like the above is, IMHO, another good reason why not using magic numbers: people may end fixing match groups to use non-capture matches, but end forgetting to fix some hidden magic numbers. It is hard for a reviewer to see it if the affected magic numbers aren't within the 3 lines range of a default unified diff, and may introduce hidden bugs. Thanks, Mauro
Mauro Carvalho Chehab <mchehab+huawei@kernel.org> writes: > Perhaps one alternative would do something like: > > tuples = struct_members.findall(members) > if not tuples: > break > > maintype, -, -, content, -, s_ids = tuples > > (assuming that we don't need t[1], t[2] and t[4] here) > > Btw, on this specific case, better to use non-capture group matches > to avoid those "empty" spaces, e.g. (if I got it right): The problem is this line here: oldmember = "".join(t) # Reconstruct the original formatting The regex *has* to capture the entire match string so that it can be reconstructed back to its original form, which we need to edit the full list of members later on. This code could use a deep rethink, but it works for now :) Thanks, jon
Em Tue, 05 Aug 2025 16:46:10 -0600 Jonathan Corbet <corbet@lwn.net> escreveu: > Mauro Carvalho Chehab <mchehab+huawei@kernel.org> writes: > > > Perhaps one alternative would do something like: > > > > tuples = struct_members.findall(members) > > if not tuples: > > break > > > > maintype, -, -, content, -, s_ids = tuples > > > > (assuming that we don't need t[1], t[2] and t[4] here) > > > > Btw, on this specific case, better to use non-capture group matches > > to avoid those "empty" spaces, e.g. (if I got it right): > > The problem is this line here: > > oldmember = "".join(t) # Reconstruct the original formatting > > The regex *has* to capture the entire match string so that it can be > reconstructed back to its original form, which we need to edit the full > list of members later on. > > This code could use a deep rethink, but it works for now :) well, we can still do: for t in tuples: maintype, -, -, content, -, s_ids = t oldmember = "".join(t) this way, we'll be naming the relevant parameters and reconstructing the the original form. IMO, this is a lot better than using t[0], t[3], t[5] at the code, as the names makes it clear what each one actually captured. - Btw, while re.findall() has an API that doesn't return match objects which is incoherent with the normal re API, while looking at the specs today(*), there is an alternative: re.finditer(). We could add it to KernRE cass and use it on a way that it will use a Match instance. Something like: # Original regex expression res = Re.finditer(...) # Not much difference here. Probably not worh using it for match in res: oldmember = "".join(match.groups()) maintype, -, -, content, -, s_ids = match.groups() Or alternatively: res = Re.finditer(...) # Not much difference here. Probably not worth using it for match in res: oldmember = "".join(match.groups()) # replace at the code below: # maintype -> match.group('maintype') # content -> match.group('content') # s_ids -> match.group('s_ids') No idea about performance differences between findall and finditer. (*) https://docs.python.org/3/library/re.html btw, one possibility that would avoid having tuples and t is to do something like: struct_members = KernRe("(" + # group 1: the entire pattern type_pattern + # Capture main type r'([^\{\};]+)' + r'(?:\{)' + r'(?:[^\{\}]*)' + # Capture content r'(?:\})' + r'([^\{\};]*)(;)') # Capture IDs ")") match = struct_members.finditer(line) for match in res: oldmember, maintype, content, s_ids = match.groups() (disclaimer notice: none of the above was tested) Thanks, Mauro
Mauro Carvalho Chehab <mchehab+huawei@kernel.org> writes: >> > Btw, on this specific case, better to use non-capture group matches >> > to avoid those "empty" spaces, e.g. (if I got it right): >> >> The problem is this line here: >> >> oldmember = "".join(t) # Reconstruct the original formatting >> >> The regex *has* to capture the entire match string so that it can be >> reconstructed back to its original form, which we need to edit the full >> list of members later on. >> >> This code could use a deep rethink, but it works for now :) > > well, we can still do: > > for t in tuples: > maintype, -, -, content, -, s_ids = t > oldmember = "".join(t) > > this way, we'll be naming the relevant parameters and reconstructing > the the original form. I've already made a change much like that (the "-" syntax doesn't work, of course); I hope to post the updates series today, but it's going to be busy. > Btw, while re.findall() has an API that doesn't return match > objects which is incoherent with the normal re API, while looking > at the specs today(*), there is an alternative: re.finditer(). > We could add it to KernRE cass and use it on a way that it will use > a Match instance. Something like: > > # Original regex expression > res = Re.finditer(...) > [...] A definite possible improvement for later... :) Thanks, jon
Em Wed, 06 Aug 2025 07:00:19 -0600 Jonathan Corbet <corbet@lwn.net> escreveu: > Mauro Carvalho Chehab <mchehab+huawei@kernel.org> writes: > > >> > Btw, on this specific case, better to use non-capture group matches > >> > to avoid those "empty" spaces, e.g. (if I got it right): > >> > >> The problem is this line here: > >> > >> oldmember = "".join(t) # Reconstruct the original formatting > >> > >> The regex *has* to capture the entire match string so that it can be > >> reconstructed back to its original form, which we need to edit the full > >> list of members later on. > >> > >> This code could use a deep rethink, but it works for now :) > > > > well, we can still do: > > > > for t in tuples: > > maintype, -, -, content, -, s_ids = t > > oldmember = "".join(t) > > > > this way, we'll be naming the relevant parameters and reconstructing > > the the original form. > > I've already made a change much like that (the "-" syntax doesn't work, > of course); Gah! sorry for the typo. I meant to say: maintype, _, _, content, _, s_ids = t > I hope to post the updates series today, but it's going to > be busy. > > > Btw, while re.findall() has an API that doesn't return match > > objects which is incoherent with the normal re API, while looking > > at the specs today(*), there is an alternative: re.finditer(). > > We could add it to KernRE cass and use it on a way that it will use > > a Match instance. Something like: > > > > # Original regex expression > > res = Re.finditer(...) > > [...] > > A definite possible improvement for later... :) Ok. > > Thanks, > > jon Thanks, Mauro
© 2016 - 2025 Red Hat, Inc.