[v2] Improve ABI documentation generation

[RFC v2 18/38] docs: sphinx/kernel_abi: use AbiParser directly

Posted by Mauro Carvalho Chehab 1 year ago

Instead of running get_abi.py script, import AbiParser class and
handle messages directly there using an iterator. This should save some
memory, as there's no need to exec python inside the Sphinx python
extension.

Signed-off-by: Mauro Carvalho Chehab <mchehab+huawei@kernel.org>
---
 Documentation/sphinx/kernel_abi.py | 26 +++++++++++++++-----------
 scripts/get_abi.py                 |  2 +-
 2 files changed, 16 insertions(+), 12 deletions(-)

diff --git a/Documentation/sphinx/kernel_abi.py b/Documentation/sphinx/kernel_abi.py
index fc7500fad119..93d537d8cb6c 100644
--- a/Documentation/sphinx/kernel_abi.py
+++ b/Documentation/sphinx/kernel_abi.py
@@ -42,6 +42,11 @@ from docutils.parsers.rst import directives, Directive
 from sphinx.util.docutils import switch_source_input
 from sphinx.util import logging
 
+srctree = os.path.abspath(os.environ["srctree"])
+sys.path.insert(0, os.path.join(srctree, "scripts"))
+
+from get_abi import AbiParser
+
 __version__ = "1.0"
 
 
@@ -65,7 +70,7 @@ class KernelCmd(Directive):
     logger = logging.getLogger('kernel_abi')
 
     option_spec = {
-        "debug"     : directives.flag,
+        "debug": directives.flag,
     }
 
     def run(self):
@@ -73,18 +78,17 @@ class KernelCmd(Directive):
         if not doc.settings.file_insertion_enabled:
             raise self.warning("docutils: file insertion disabled")
 
-        srctree = os.path.abspath(os.environ["srctree"])
+        path = os.path.join(srctree, "Documentation", self.arguments[0])
+        parser = AbiParser(path, logger=self.logger)
+        parser.parse_abi()
+        parser.check_issues()
 
-        args = [
-            os.path.join(srctree, 'scripts/get_abi.py'),
-            '-D', os.path.join(srctree, 'Documentation', self.arguments[0]),
-            'rest',
-            '--enable-lineno',
-        ]
+        msg = ""
+        for m in parser.doc(enable_lineno=True, show_file=True):
+            msg += m
 
-        lines = subprocess.check_output(args, cwd=os.path.dirname(doc.current_source)).decode('utf-8')
-        nodeList = self.nestedParse(lines, self.arguments[0])
-        return nodeList
+        node = self.nested_parse(msg, self.arguments[0])
+        return node
 
     def nested_parse(self, lines, fname):
         env = self.state.document.settings.env
diff --git a/scripts/get_abi.py b/scripts/get_abi.py
index 2aec1f9dc5aa..3a8dcff85dc2 100755
--- a/scripts/get_abi.py
+++ b/scripts/get_abi.py
@@ -441,7 +441,7 @@ class AbiParser:
 
         return new_desc + "\n\n"
 
-    def doc(self, enable_lineno, output_in_txt, show_file=False):
+    def doc(self, enable_lineno=False, output_in_txt=False, show_file=False):
         """Print ABI at stdout"""
 
         part = None
-- 
2.48.1

Re: [RFC v2 18/38] docs: sphinx/kernel_abi: use AbiParser directly

Posted by Jonathan Corbet 1 year ago

Mauro Carvalho Chehab <mchehab+huawei@kernel.org> writes:

> Instead of running get_abi.py script, import AbiParser class and
> handle messages directly there using an interactor. This shold save some
> memory, as there's no need to exec python inside the Sphinx python
> extension.
>
> Signed-off-by: Mauro Carvalho Chehab <mchehab+huawei@kernel.org>
> ---
>  Documentation/sphinx/kernel_abi.py | 26 +++++++++++++++-----------
>  scripts/get_abi.py                 |  2 +-
>  2 files changed, 16 insertions(+), 12 deletions(-)
>
> diff --git a/Documentation/sphinx/kernel_abi.py b/Documentation/sphinx/kernel_abi.py
> index fc7500fad119..93d537d8cb6c 100644
> --- a/Documentation/sphinx/kernel_abi.py
> +++ b/Documentation/sphinx/kernel_abi.py
> @@ -42,6 +42,11 @@ from docutils.parsers.rst import directives, Directive
>  from sphinx.util.docutils import switch_source_input
>  from sphinx.util import logging
>  
> +srctree = os.path.abspath(os.environ["srctree"])
> +sys.path.insert(0, os.path.join(srctree, "scripts"))
> +
> +from get_abi import AbiParser

I have to admit that I don't like this bit of messing around with the
path.  And importing things out of scripts/ seems ... inelegant.

I take it you still want to be able to run get_abi.py standalone even
after it's directly integrated into Sphinx?  In this case, it might be
nicer to have the common library functionality in its own module that
can be imported into both sphinx and the standalone command.  That still
leaves open the question of where that module lives
(Documentation/sphinx perhaps?) and how the Python path gets set up
correctly...

jon

Re: [RFC v2 18/38] docs: sphinx/kernel_abi: use AbiParser directly

Posted by Mauro Carvalho Chehab 1 year ago

Em Tue, 28 Jan 2025 15:37:25 -0700
Jonathan Corbet <corbet@lwn.net> escreveu:

> Mauro Carvalho Chehab <mchehab+huawei@kernel.org> writes:
> 
> > Instead of running get_abi.py script, import AbiParser class and
> > handle messages directly there using an interactor. This shold save some
> > memory, as there's no need to exec python inside the Sphinx python
> > extension.
> >
> > Signed-off-by: Mauro Carvalho Chehab <mchehab+huawei@kernel.org>
> > ---
> >  Documentation/sphinx/kernel_abi.py | 26 +++++++++++++++-----------
> >  scripts/get_abi.py                 |  2 +-
> >  2 files changed, 16 insertions(+), 12 deletions(-)
> >
> > diff --git a/Documentation/sphinx/kernel_abi.py b/Documentation/sphinx/kernel_abi.py
> > index fc7500fad119..93d537d8cb6c 100644
> > --- a/Documentation/sphinx/kernel_abi.py
> > +++ b/Documentation/sphinx/kernel_abi.py
> > @@ -42,6 +42,11 @@ from docutils.parsers.rst import directives, Directive
> >  from sphinx.util.docutils import switch_source_input
> >  from sphinx.util import logging
> >  
> > +srctree = os.path.abspath(os.environ["srctree"])
> > +sys.path.insert(0, os.path.join(srctree, "scripts"))
> > +
> > +from get_abi import AbiParser  
> 
> I have to admit that I don't like this bit of messing around with the
> path.  And importing things out of scripts/ seems ... inelegant.
> 
> I take it you still want to be able to run get_abi.py standalone even
> after it's directly integrated into Sphinx?  

Yes, because calling it via command line provides:

1. a way to test the parser and check the results;
2. a search utility;
3. the undefined symbol verification.

Btw, if you look at the other Sphinx modules, they do exactly the same:
they execute code from scripts/. The only difference here is that,
instead of loading a perl/python/shell engine and running the entire
script from there, it is importing just a class.

> In this case, it might be
> nicer to have the common library functionality in its own module that
> can be imported into both sphinx and the standalone command. 

This would be possible too: place the classes on a common lib dir and
then import it from both command line and Sphinx extensions.

If we're willing to do that, then perhaps we can have separate files
for each different class, as this could make it easier to maintain.

> That still
> leaves open the question of where that module lives
> (Documentation/sphinx perhaps?) and how the Python path gets set up
> correctly...

I guess the command line at scripts/ could use something like this to
get the library location (untested) and add to the import search PATH:

	import os
	import sys

	python_lib_dir="some/location"

	scriptdir = os.path.dirname(os.path.realpath(__file__))

	sys.path.insert(0, os.path.join(scriptdir, f"../{python_lib_dir}"))

	from abi_parser import AbiParser

Now, I'm not sure if the best location for Python libraries would
be at Documentation/sphinx, as we may end up needing other Python
libraries over time, and not all of them would be used by Sphinx.

In short: I would be more inclined to place them on 
a new lib directory (tools/lib? tools/py_lib? scripts/lib?).

See, with the content of this series, if we split files per each class,
it would mean 3 files:

  1. abi_parser.py, containing AbiParser class (plus ABI_DIR const);
     (this is the only class used by Documentation/sphinx extensions)
  2. abi_regex.py, containing AbiRegex class;
  3. abi_symbols.py, containing SystemSymbols class.

Now, if we're going in this direction, it may also make sense to split
the command line classes/functions into 4 (or 5 files) for argparse
argument definition and command run code. If we do that, it means that
other files will need to be stored somewhere:

  4. abi_cmd_rest.py: AbiRest and AbiValidate classes for the rest 
     and validate arguments (I would likely place both at the same file,
     as the code is similar - but it could also be split on two separate
     files);
  5. abi_cmd_search.py: AbiSearch - for the search arguments;
  6. abi_cmd_undefined.py: AbiUndefined - for the undocumented symbol check 
     arguments;

Finally, there is the one under scripts/:

  7. get_abi.py: with the main function

For (1), Documentation/sphinx could make sense, but (2) to (6) are
used only by the command line tool. Placing them at Documentation/
seems weird. Nothing prevents having them at scripts/ but, IMHO, things
would become more organized if we place the Python files with 0644
flags elsewhere.

Thanks,
Mauro

Re: [RFC v2 18/38] docs: sphinx/kernel_abi: use AbiParser directly

Posted by Mauro Carvalho Chehab 1 year ago

Hi Jon,

Em Wed, 29 Jan 2025 01:43:24 +0100
Mauro Carvalho Chehab <mchehab+huawei@kernel.org> escreveu:

> Em Tue, 28 Jan 2025 15:37:25 -0700
> Jonathan Corbet <corbet@lwn.net> escreveu:
> 
> > Mauro Carvalho Chehab <mchehab+huawei@kernel.org> writes:
> >   
> > > Instead of running get_abi.py script, import AbiParser class and
> > > handle messages directly there using an interactor. This shold save some
> > > memory, as there's no need to exec python inside the Sphinx python
> > > extension.
> > >
> > > Signed-off-by: Mauro Carvalho Chehab <mchehab+huawei@kernel.org>
> > > ---
> > >  Documentation/sphinx/kernel_abi.py | 26 +++++++++++++++-----------
> > >  scripts/get_abi.py                 |  2 +-
> > >  2 files changed, 16 insertions(+), 12 deletions(-)
> > >
> > > diff --git a/Documentation/sphinx/kernel_abi.py b/Documentation/sphinx/kernel_abi.py
> > > index fc7500fad119..93d537d8cb6c 100644
> > > --- a/Documentation/sphinx/kernel_abi.py
> > > +++ b/Documentation/sphinx/kernel_abi.py
> > > @@ -42,6 +42,11 @@ from docutils.parsers.rst import directives, Directive
> > >  from sphinx.util.docutils import switch_source_input
> > >  from sphinx.util import logging
> > >  
> > > +srctree = os.path.abspath(os.environ["srctree"])
> > > +sys.path.insert(0, os.path.join(srctree, "scripts"))
> > > +
> > > +from get_abi import AbiParser    
> > 
> > I have to admit that I don't like this bit of messing around with the
> > path.  And importing things out of scripts/ seems ... inelegant.
> > 
> > I take it you still want to be able to run get_abi.py standalone even
> > after it's directly integrated into Sphinx?    
> 
> Yes, because calling it via command line provides:
> 
> 1. a way to test the parser and check the results;
> 2. a search utility;
> 3. the undefined symbol verification.
> 
> Btw, if you look at the other Sphinx modules, they do exactly the same:
> they execute code from scripts/. The only difference here is that,
> instead of loading the a perl/python/shell engine and running the entire
> script from there, it is importing just a class.
> 
> > In this case, it might be
> > nicer to have the common library functionality in its own module that
> > can be imported into both sphinx and the standalone command.   
> 
> This would be possible too: place the classes on a common lib dir and
> then import it from both command line and Sphinx extensions.
> 
> If we're willing to do that, then perhaps we can have separate files
> for each different class, as this could make it easier to maintain.
> 
> > That still
> > leaves open the question of where that module lives
> > (Documentation/sphinx perhaps?) and how the Python path gets set up
> > correctly...  
> 
> I guess the command line at scripts/ could use something like this to
> get the library location (untested) and add to the import search PATH:
> 
> 	import os
> 
> 	python_lib_dir="some/location"
> 
> 	scriptdir = os.path.dirname(os.path.realpath(__file__))
> 
> 	sys.path.insert(0, os.path.join(srctree, f"../{python_lib_dir}"))
> 
> 	from abi_parser import abiParser
> 
> Now, I'm not sure if the best location for python libraries would
> be at Documentation/sphinx, as we may end needing other python
> libraries with time and not all would be used by Sphinx.
> 
> In short: I would be more inclined to place them on 
> a new lib directory (tools/lib? tools/py_lib? scripts/lib?).
> 
> See, with the content of this series, if we split files per each class,
> it would mean 3 files:
> 
>   1. abi_parser.py, containing AbiParser class (plus ABI_DIR const);
>      (this is the only class used by Documentation/sphinx extensions)
>   2. abi_regex.py, containing AbiRegex class;
>   3. abi_symbols.py, containing SystemSymbols class.
> 
> Now, if we're going on this direction, it may also make sense to split
> the command line classes/functions into 4 (or 5 files) for argparse
> argument definition and command run code. If we do that, it means that
> other files will be stored somewhere:
> 
>   4. abi_cmd_rest.py: AbiRest and AbiValidate classes for the rest 
>      and validate arguments (I would likely place both at the same file,
>      as the code is similar - but it could also be split on two separate
>      files);
>   5. abi_cmd_search.py: AbiSearch - for the search arguments;
>   6. abi_cmd_undefined.py: AbiUndefined - for the undocumented symbol check 
>      arguments;
> 
> Finally, there is the one under scripts/:
> 
>   7. get_abi.py: with the main function
> 
> For (1), Documentation/sphinx could make sense, but (2) to (6) are
> used only by the command line tool. Placing them at Documentation/ 
> seems weird. Well, nothing prevents having them at scripts/, IMHO, things
> would become more organized if we place the Python files with 0644
> flags elsewhere.

As I'll be preparing such patches for merge this week, I'd
like to know what you prefer in terms of directories:

1. Keep it as-is;
2. have a separate library directory for Python modules
   (scripts/lib?);
3. place python modules inside scripts/;
4. place python modules inside Documentation/sphinx (IMO a bad
   idea);
5. something else

-

Btw, I'm also considering submitting, later on, a patchset similar to
this one converting kernel-doc to Python. I already started writing
something like that (written from scratch, following as much
as possible what we have today in Perl to avoid regressions).

I would probably split the code into separate classes to make the code 
more readable/maintainable (a base class, a class with rest output,
another one with man output, and a few other helper classes).

Thanks,
Mauro

Re: [RFC v2 18/38] docs: sphinx/kernel_abi: use AbiParser directly

Posted by Jonathan Corbet 1 year ago

Mauro Carvalho Chehab <mchehab+huawei@kernel.org> writes:

> Hi Jon,

> As I'll be preparing such patches for merge along this week, I'd
> like to know what do you prefer in terms of directories:
>
> 1. Keep it as-is;
> 2. have a separate library directory for Python modules
>    (scripts/lib?);
> 3. place python modules inside scripts/;
> 4. place python modules inside Documentation/sphinx (IMO a bad
>    idea);
> 5. something else

Honestly, I'm not sure.  I do feel that importing out of scripts/ is
inelegant at best; having a dedicated directory for modules meant to be
imported would be better.  So maybe scripts/lib?  Or lib/python, though
that might raise eyebrows elsewhere, dunno.  Pick something you like,
and we'll give that a try.

> Btw, I'm considering to also submit later a patchset similar to
> this one converting kernel-doc to Python. I already started writing
> something like that (written from the scratch, following as much
> as possible what we have today on Perl to avoid regressions).
>
> I would probably split the code into separate classes to make the code 
> more readable/maintainable (a base class, a class with rest output,
> another one with man output, and a few other helper classes).

I definitely approve of the idea - I've pondered doing such a thing, but
have never come close to finding the time.  It's probably worth looking
at the rewrite Markus did years ago as a starting point?

Thanks,

jon

Re: [RFC v2 18/38] docs: sphinx/kernel_abi: use AbiParser directly

Posted by Mauro Carvalho Chehab 12 months ago

Em Tue, 04 Feb 2025 10:12:22 -0700
Jonathan Corbet <corbet@lwn.net> escreveu:

> Mauro Carvalho Chehab <mchehab+huawei@kernel.org> writes:
> 
> > Hi Jon,  
> 
> > As I'll be preparing such patches for merge along this week, I'd
> > like to know what do you prefer in terms of directories:
> >
> > 1. Keep it as-is;
> > 2. have a separate library directory for Python modules
> >    (scripts/lib?);
> > 3. place python modules inside scripts/;
> > 4. place python modules inside Documentation/sphinx (IMO a bad
> >    idea);
> > 5. something else  
> 
> Honestly, I'm not sure.  I do feel that importing out of scripts/ is
> inelegant at best; having a dedicated directory for modules meant to be
> imported would be better.  So maybe scripts/lib?  Or lib/python, though
> that might raise eyebrows elsewhere, dunno.  Pick something you like,
> and we'll give that a try.

I would go for scripts/lib then.

> 
> > Btw, I'm considering to also submit later a patchset similar to
> > this one converting kernel-doc to Python. I already started writing
> > something like that (written from the scratch, following as much
> > as possible what we have today on Perl to avoid regressions).
> >
> > I would probably split the code into separate classes to make the code 
> > more readable/maintainable (a base class, a class with rest output,
> > another one with man output, and a few other helper classes).  
> 
> I definitely approve of the idea - I've pondered doing such a thing, but
> have never come close to finding the time.  It's probably worth looking
> at the rewrite Markus did years ago as a starting point?

I took a look at Markus' work: it was licensed under GPL 3.0 and it was
written in 2016. There were several changes to kerneldoc since then,
including the addition of a regex that is not compatible with
Python re[1]:

        $members =~ s/\bSTRUCT_GROUP(\(((?:(?>[^)(]+)|(?1))*)\))[^;]*;/$2/gos;

This one uses:

	- recursive patterns: (?1)
	- atomic grouping: (?>...)

Also, it is hard to map what he does with the existing script. I'm
opting to write a new script from scratch. I have already a version
that does ReST conversion (still missing man pages and filtering).
I'll send a RFC patch once I get it to the point it works properly.

[1] In the specific case of such regex, added on this commit:

	 50d7bd38c3aa ("stddef: Introduce struct_group() helper macro")

    I'm not sure why a very complex regex was used there. I'm currently 
    stuck on handling it properly in Python. An option would be to do:

	try:
		import regex as re
		has_regex_advanced_features = True
	except:
		import re
		has_regex_advanced_features = False

    And, if has_regex_advanced_features is false, use a simpler regex 
    that would work in Python or would just ignore struct_group
    expressions.

    Another option would be to re-implement such regexes without using
    such advanced patterns.

Thanks,
Mauro

Re: [RFC v2 18/38] docs: sphinx/kernel_abi: use AbiParser directly

Posted by Jonathan Corbet 12 months ago

Mauro Carvalho Chehab <mchehab+huawei@kernel.org> writes:

> I took a look on Markus work: it was licensed under GPL 3.0 and it was
> written in 2016. There were several changes on kerneldoc since them,
> including the addition of a regex that it is not compatible with
> Python re[1]:
>
>         $members =~ s/\bSTRUCT_GROUP(\(((?:(?>[^)(]+)|(?1))*)\))[^;]*;/$2/gos;
>
> This one use:
>
> 	- recursive patterns: ?1
> 	- atomic grouping (?>...)
>
> Also, it is hard to map what he does with the existing script. I'm
> opting to write a new script from scratch.

That's fine, I just wanted to be sure you'd had a chance to look at
it... 

>     Another option would be to re-implement such regexes without using
>     such advanced patterns.

Seems like a preferred option if that can be done.  Banging one's head
against all those regexes is often the hardest part of dealing with that
script; anything that makes it simpler is welcome.

Thanks,

jon

Re: [RFC v2 18/38] docs: sphinx/kernel_abi: use AbiParser directly

Posted by Mauro Carvalho Chehab 12 months ago

Em Mon, 10 Feb 2025 07:40:02 -0700
Jonathan Corbet <corbet@lwn.net> escreveu:

> Mauro Carvalho Chehab <mchehab+huawei@kernel.org> writes:
> 
> > I took a look on Markus work: it was licensed under GPL 3.0 and it was
> > written in 2016. There were several changes on kerneldoc since them,
> > including the addition of a regex that it is not compatible with
> > Python re[1]:
> >
> >         $members =~ s/\bSTRUCT_GROUP(\(((?:(?>[^)(]+)|(?1))*)\))[^;]*;/$2/gos;
> >
> > This one use:
> >
> > 	- recursive patterns: ?1
> > 	- atomic grouping (?>...)
> >
> > Also, it is hard to map what he does with the existing script. I'm
> > opting to write a new script from scratch.  
> 
> That's fine, I just wanted to be sure you'd had a chance to look at
> it... 
> 
> >     Another option would be to re-implement such regexes without using
> >     such advanced patterns.  
> 
> Seems like a preferred option if that can be done.  Banging one's head
> against all those regexes is often the hardest part of dealing with that
> script; anything that makes it simpler is welcome.

Agreed. This one, in particular, is very hard for me to understand, as I
never used recursive patterns or atomic grouping. The net result of
the struct_group*() handling is that it removes some parameters when
generating the function prototype. This is done using complex logic
in two steps:

       # unwrap struct_group():
        # - first eat non-declaration parameters and rewrite for final match
        # - then remove macro, outer parens, and trailing semicolon
        $members =~ s/\bstruct_group\s*\(([^,]*,)/STRUCT_GROUP(/gos;
        $members =~ s/\bstruct_group_attr\s*\(([^,]*,){2}/STRUCT_GROUP(/gos;
        $members =~ s/\bstruct_group_tagged\s*\(([^,]*),([^,]*),/struct $1 $2; STRUCT_GROUP(/gos;
        $members =~ s/\b__struct_group\s*\(([^,]*,){3}/STRUCT_GROUP(/gos;
        $members =~ s/\bSTRUCT_GROUP(\(((?:(?>[^)(]+)|(?1))*)\))[^;]*;/$2/gos;


The first step basically eliminates some members of the function. At the
places I checked, the second step was just removing the parentheses from the
macro (and the STRUCT_GROUP name).

I suspect that the same result could be done with a much simpler expression
like:

	$members =~ s/\bSTRUCT_GROUP\((.*)\)[^;]*;/$1/gos;

But maybe there are some corner cases that would cause such simpler
regex to fail.

-

On a side note, the "o" flag used there at kernel-doc is described
as[1]:

	"o  - pretend to optimize your code, but actually introduce bugs"

I wonder if we're reaching any issues on kernel docs due to that ;-)

[1] https://perldoc.perl.org/perlre#Other-Modifiers

> 
> Thanks,
> 
> jon

Re: [RFC v2 18/38] docs: sphinx/kernel_abi: use AbiParser directly

Posted by Mauro Carvalho Chehab 11 months, 3 weeks ago

Em Mon, 10 Feb 2025 17:03:54 +0100
Mauro Carvalho Chehab <mchehab+huawei@kernel.org> escreveu:

> Em Mon, 10 Feb 2025 07:40:02 -0700
> Jonathan Corbet <corbet@lwn.net> escreveu:
> 
> > Mauro Carvalho Chehab <mchehab+huawei@kernel.org> writes:
> >   
> > > I took a look on Markus work: it was licensed under GPL 3.0 and it was
> > > written in 2016. There were several changes on kerneldoc since them,
> > > including the addition of a regex that it is not compatible with
> > > Python re[1]:
> > >
> > >         $members =~ s/\bSTRUCT_GROUP(\(((?:(?>[^)(]+)|(?1))*)\))[^;]*;/$2/gos;
> > >
> > > This one use:
> > >
> > > 	- recursive patterns: ?1
> > > 	- atomic grouping (?>...)
> > >
> > > Also, it is hard to map what he does with the existing script. I'm
> > > opting to write a new script from scratch.    
> > 
> > That's fine, I just wanted to be sure you'd had a chance to look at
> > it... 
> >   
> > >     Another option would be to re-implement such regexes without using
> > >     such advanced patterns.    
> > 
> > Seems like a preferred option if that can be done.  Banging one's head
> > against all those regexes is often the hardest part of dealing with that
> > script; anything that makes it simpler is welcome.  
> 
> Agreed. This one, in special, is very hard for me to understand, as I
> never used recursive patterns or atomic grouping. The net result of
> the struct_group*() handling is that it removes some parameters when
> generating the function prototype. This is done using a complex logic
> on two steps:
> 
>        # unwrap struct_group():
>         # - first eat non-declaration parameters and rewrite for final match
>         # - then remove macro, outer parens, and trailing semicolon
>         $members =~ s/\bstruct_group\s*\(([^,]*,)/STRUCT_GROUP(/gos;
>         $members =~ s/\bstruct_group_attr\s*\(([^,]*,){2}/STRUCT_GROUP(/gos;
>         $members =~ s/\bstruct_group_tagged\s*\(([^,]*),([^,]*),/struct $1 $2; STRUCT_GROUP(/gos;
>         $members =~ s/\b__struct_group\s*\(([^,]*,){3}/STRUCT_GROUP(/gos;
>         $members =~ s/\bSTRUCT_GROUP(\(((?:(?>[^)(]+)|(?1))*)\))[^;]*;/$2/gos;
> 
> 
> The first step basically eliminates some members of the function. At the
> places I checked, the second step was just removing parenthesis from the
> macro (and the STRUCT_GROUP name).
> 
> I suspect that the same result could be done with a much simpler expression
> like:
> 
> 	$members =~ s/\bSTRUCT_GROUP\((.*)\)[^;]*;/$2/gos;
> 
> But maybe there are some corner cases that would cause such simpler
> regex to fail.

A simpler regex didn't work.

Instead of using the Python regex module and keeping such complex regular
expressions, I opted to implement it in a completely different way.
See patch below.

Basically, I implemented a new class (NestedMatch) just to
handle patterns like:

	FOO(.*)

converting them into:

	.*

I expect that the new logic there would work properly for any
number of [], {} and () paired delimiters.

I ended up writing this class from scratch, based on a suggestion from
stackoverflow.

Using such logic, replacing STRUCT_GROUP(.*) by .* is as simple
as:

        sub_nested_prefixes = [
            (re.compile(r'\bSTRUCT_GROUP'),  r'\1'),
        ]

        nested = NestedMatch()

        for search, sub in sub_nested_prefixes:
            members = nested.sub(search, sub, members)

This should probably help cleaning up other similar patterns.
Let's see. Anyway, I expect that this would make the C parsing
part easier to maintain.

For the first version, I'll use the new way only when I notice
discrepancies between Perl and Python versions.

Thanks,
Mauro

---

[PATCH] scripts/kernel-doc.py: properly handle struct_group macros

Handling nested parentheses with regular expressions is not an
easy task. It is even harder with Python's re module, as it
supports only a limited subset of regular expressions, missing more
advanced features.

We might instead use the Python regex module, but the
regular expressions would still be very hard to understand. So, instead,
add logic to properly match delimiters.

Signed-off-by: Mauro Carvalho Chehab <mchehab+huawei@kernel.org>

diff --git a/scripts/kernel-doc.py b/scripts/kernel-doc.py
index 9b10fd05b6af..df824692c4e7 100755
--- a/scripts/kernel-doc.py
+++ b/scripts/kernel-doc.py
@@ -92,6 +92,138 @@ class Re:
     def group(self, num):
         return self.last_match.group(num)
 
+class NestedMatch:
+    """
+    Finding nested delimiters is hard with regular expressions. It is
+    even harder on Python with its normal re module, as there are several
+    advanced regular expressions that are missing.
+
+    This is the case of this pattern:
+
+            '\bSTRUCT_GROUP(\(((?:(?>[^)(]+)|(?1))*)\))[^;]*;'
+
+    which is used to properly match open/close parenthesis of the
+    string search STRUCT_GROUP(),
+
+    Add a class that counts pairs of delimiters, using it to match and
+    replace nested expressions.
+
+    The original approach was suggested by:
+        https://stackoverflow.com/questions/5454322/python-how-to-match-nested-parentheses-with-regex
+
+    Although I re-implemented it to make it more generic and match 3 types
+    of delimiters. The logic checks if delimiters are paired. If not, it
+    will ignore the search string.
+    """
+
+    DELIMITER_PAIRS = {
+        '{': '}',
+        '(': ')',
+        '[': ']',
+    }
+
+    RE_DELIM = re.compile(r'[\{\}\[\]\(\)]')
+
+    def _search(self, regex, line):
+        """
+        Finds paired blocks.
+
+        If a given regex is not empty, the logic will first seek for its
+        position, and then handles the next delimiters afterwards
+
+        The suggestion of using finditer to match pairs came from:
+        https://stackoverflow.com/questions/5454322/python-how-to-match-nested-parentheses-with-regex
+        but I ended using a different implementation to align all three types
+        of delimiters and seek for an initial regular expression.
+
+        The algorithm seeks for open/close paired delimiters and place them
+        into a stack, yielding a start/stop position of each match  when the
+        stack is zeroed.
+
+        The algorithm shoud work fine for properly paired lines, but will
+        silently ignore end delimiters that preceeds an start delimiter.
+        This should be OK for kernel-doc parser, as unaligned delimiters
+        would cause compilation errors. So, we don't need to rise exceptions
+        to cover such issues.
+        """
+
+        stack = []
+
+        for match_re in regex.finditer(line):
+            offset = match_re.end()
+
+            for match in self.RE_DELIM.finditer(line[offset:]):
+                pos = match.start() + offset
+
+                d = line[pos]
+
+                if d in self.DELIMITER_PAIRS.keys():
+                    end = self.DELIMITER_PAIRS[d]
+
+                    stack.append(end)
+                    continue
+
+                # Does the end delimiter match what it is expected?
+                if stack and d == stack[-1]:
+                    stack.pop()
+
+                    if not stack:
+                        yield offset, pos + 1
+
+    def search(self, regex, line):
+        """
+        This is similar to re.search:
+
+        It matches a regex that it is followed by a delimiter,
+        returning occurrences only if all delimiters are paired.
+        """
+
+        for prev_pos, pos in self._search(regex, line):
+
+            yield line[prev_pos:pos]
+
+    def sub(self, regex, sub, line, count=0):
+        """
+        This is similar to re.sub:
+
+        It matches a regex that it is followed by a delimiter,
+        replacing occurrences only if all delimiters are paired.
+
+        if r'\1' is used, it works just like re: it places there the
+        matched paired data with the delimiter stripped.
+
+        If count is different than zero, it will replace at most count
+        items.
+        """
+        out = ""
+
+        cur_pos = 0
+        n = 0
+
+        for start, end in self._search(regex, line):
+            out += line[cur_pos:start]
+
+            # Value, ignoring start/end delimiters
+            value = line[start + 1:end - 1]
+
+            # replaces \1 at the sub string, if \1 is used there
+            new_sub = sub
+            new_sub = new_sub.replace(r'\1', value)
+
+            out += new_sub
+
+            cur_pos = end
+            n += 1
+
+            if count and count >= n:
+                break
+
+        # Append the remaining string
+        l = len(line)
+        out += line[cur_pos:l]
+
+        return out
+
 #
 # Regular expressions used to parse kernel-doc markups at KernelDoc class.
 #
@@ -667,19 +799,40 @@ class KernelDoc:
             # Unwrap struct_group() based on this definition:
             # __struct_group(TAG, NAME, ATTRS, MEMBERS...)
             # which has variants like: struct_group(NAME, MEMBERS...)
+            #
+            # Parsing them are done on two steps:
+            # 1. drop arguments that aren't struct names;
+            # 2. remove STRUCT_GROUP() ancillary macro.
+            #
+            # NOTE: the original logic which replaces STRUCT_GROUP(.*) by .*
+            # is incompatible with Python re, as it uses:
+            #
+            #   - a recursive pattern: (?1)
+            #   - an atomic grouping: (?>...)
+            #
+            # This is the original expression:
+            #   '\bSTRUCT_GROUP(\(((?:(?>[^)(]+)|(?1))*)\))[^;]*;'
+            #
+            # I tried a simpler version: but it didn't work either:
+            #   \bSTRUCT_GROUP\(([^\)]+)\)[^;]*;
+            #
+            # As it doesn't properly match the end parenthesis.
+            #
+            # So, a better solution was crafted: there's now a NestedMatch
+            # class that ensures that delimiters after a search are properly
+            # matched. So, the implementation to drop STRUCT_GROUP() will be
+            # handled in separate.
 
             (Re(r'\bstruct_group\s*\(([^,]*,)', re.S),  r'STRUCT_GROUP('),
             (Re(r'\bstruct_group_attr\s*\(([^,]*,){2}', re.S),  r'STRUCT_GROUP('),
             (Re(r'\bstruct_group_tagged\s*\(([^,]*),([^,]*),', re.S),  r'struct \1 \2; STRUCT_GROUP('),
             (Re(r'\b__struct_group\s*\(([^,]*,){3}', re.S),  r'STRUCT_GROUP('),
 
-            # This is incompatible with Python re, as it uses:
-            #  recursive patterns ((?1)) and atomic grouping ((?>...)):
-            #   '\bSTRUCT_GROUP(\(((?:(?>[^)(]+)|(?1))*)\))[^;]*;'
-            # Let's see if this works instead:
-            (Re(r'\bSTRUCT_GROUP\(([^\)]+)\)[^;]*;', re.S),  r'\1'),
-
             # Replace macros
+            #
+            # TODO: it is better to also move those to the NestedMatch logic,
+            # to endure that parenthesis will be properly matched.
+
             (Re(r'__ETHTOOL_DECLARE_LINK_MODE_MASK\s*\(([^\)]+)\)', re.S),  r'DECLARE_BITMAP(\1, __ETHTOOL_LINK_MODE_MASK_NBITS)'),
             (Re(r'DECLARE_PHY_INTERFACE_MASK\s*\(([^\)]+)\)', re.S),  r'DECLARE_BITMAP(\1, PHY_INTERFACE_MODE_MAX)'),
             (Re(r'DECLARE_BITMAP\s*\(' + args_pattern + r',\s*' + args_pattern + r'\)', re.S),  r'unsigned long \1[BITS_TO_LONGS(\2)]'),
@@ -691,9 +844,18 @@ class KernelDoc:
             (Re(r'DEFINE_DMA_UNMAP_LEN\s*\(' + args_pattern + r'\)', re.S),  r'__u32 \1'),
         ]
 
+        sub_nested_prefixes = [
+            (re.compile(r'\bSTRUCT_GROUP'),  r'\1'),
+        ]
+
         for search, sub in sub_prefixes:
             members = search.sub(sub, members)
 
+        nested = NestedMatch()
+
+        for search, sub in sub_nested_prefixes:
+            members = nested.sub(search, sub, members)
+
         # Keeps the original declaration as-is
         declaration = members