From nobody Tue Apr 7 16:20:24 2026 Received: from smtp.kernel.org (aws-us-west-2-korg-mail-1.web.codeaurora.org [10.30.226.201]) (using TLSv1.2 with cipher ECDHE-RSA-AES256-GCM-SHA384 (256/256 bits)) (No client certificate requested) by smtp.subspace.kernel.org (Postfix) with ESMTPS id A9C0E3B9DA4; Thu, 12 Mar 2026 14:54:59 +0000 (UTC) Authentication-Results: smtp.subspace.kernel.org; arc=none smtp.client-ip=10.30.226.201 ARC-Seal: i=1; a=rsa-sha256; d=subspace.kernel.org; s=arc-20240116; t=1773327299; cv=none; b=mUs9tI1MV1RJEtTfpEtrBD+9QbaPE3eX2uYQAyUvaA7y+VMKwTQMsebGwGrxVWebQXu4G7t4YhyTWq879dansnRDVWzIqG1M97OyW2JpDpI+zod4VT9/EczpsddJtS4wh4OB8hUsiqdTtqWeUp2/LfHJ9N57XVnYhKNx/e20LQY= ARC-Message-Signature: i=1; a=rsa-sha256; d=subspace.kernel.org; s=arc-20240116; t=1773327299; c=relaxed/simple; bh=DOreXVDvucIGBkwwT2ilLuF9eQSdozf07HLDbrhdpG0=; h=From:To:Cc:Subject:Date:Message-ID:In-Reply-To:References: MIME-Version:Content-Type; b=sgA5V5l0IHrvBljp19KbAk5VUccDrVHA+H02ESU54gTZnd5+nPe7ojI7omWb1+9DR6BvZE/4yDINKaEzWBrPgMrK+rB5pPscqSBYUIHr3zJDpu+RM3YhjPPbCaIj1y/0EPA/e03b9iCE5ZaVu+XrUQr/f7YK+ARs/5YsoU86FKc= ARC-Authentication-Results: i=1; smtp.subspace.kernel.org; dkim=pass (2048-bit key) header.d=kernel.org header.i=@kernel.org header.b=acEbsA4A; arc=none smtp.client-ip=10.30.226.201 Authentication-Results: smtp.subspace.kernel.org; dkim=pass (2048-bit key) header.d=kernel.org header.i=@kernel.org header.b="acEbsA4A" Received: by smtp.kernel.org (Postfix) with ESMTPSA id 3B487C2BC86; Thu, 12 Mar 2026 14:54:59 +0000 (UTC) DKIM-Signature: v=1; a=rsa-sha256; c=relaxed/simple; d=kernel.org; s=k20201202; t=1773327299; bh=DOreXVDvucIGBkwwT2ilLuF9eQSdozf07HLDbrhdpG0=; h=From:To:Cc:Subject:Date:In-Reply-To:References:From; b=acEbsA4ADqePlM+In2i9lU1TFwupiPOzYZeITN2ACzHz2QhxAttkLog6ErKzN4Qwh faBy0vljf3LbzxWBlneyYcgWr8AskeiP3BoBD6xZUF4jMJbXHE8EdDrUuFeuulDwUC bFbqSnYktT1B9aNxf4NGyWpmfWWwdcHQlj2qNzJIBkUn238F3pYIANYZYSjaflYgcA 
OL2u0aI4Y07LukEpM3Jw6I2CZPJuGvjkCRRxiQbIdqL4ttjUfI8sxMvgJilqxl2H4L AJqcpsiY4iBUfaFfd1uiCbkw51VR5MkSHmYq+z/LezXmDNYrQNMnR5vEKJ/AFG5m+d njpL18fVKfXcw== Received: from mchehab by mail.kernel.org with local (Exim 4.99.1) (envelope-from ) id 1w0hRB-00000008xyZ-24rx; Thu, 12 Mar 2026 15:54:57 +0100 From: Mauro Carvalho Chehab To: Jonathan Corbet , Linux Doc Mailing List Cc: Mauro Carvalho Chehab , linux-hardening@vger.kernel.org, linux-kernel@vger.kernel.org Subject: [PATCH v2 11/28] docs: kdoc: create a CMatch to match nested C blocks Date: Thu, 12 Mar 2026 15:54:31 +0100 Message-ID: <06d21e2c38a313aec8f7c8a6df4674c41c47c23b.1773326442.git.mchehab+huawei@kernel.org> X-Mailer: git-send-email 2.52.0 In-Reply-To: References: Precedence: bulk X-Mailing-List: linux-kernel@vger.kernel.org List-Id: List-Subscribe: List-Unsubscribe: MIME-Version: 1.0 Content-Type: text/plain; charset="utf-8" Content-Transfer-Encoding: quoted-printable Sender: Mauro Carvalho Chehab The NestedMatch code is complex, and will become even more complex if we add support there for arguments. Now that we have a tokenizer, we can use a better solution that is easier to understand. Yet, to improve performance, it is better to make it use previously tokenized code, changing its API. So, reimplement NestedMatch using the CTokenizer class. Once that is done, we can drop NestedMatch.
Signed-off-by: Mauro Carvalho Chehab --- tools/lib/python/kdoc/c_lex.py | 222 +++++++++++++++++++++++++++--- tools/unittests/test_tokenizer.py | 3 +- 2 files changed, 203 insertions(+), 22 deletions(-) diff --git a/tools/lib/python/kdoc/c_lex.py b/tools/lib/python/kdoc/c_lex.py index 38f70e836eb8..e986a4ad73e3 100644 --- a/tools/lib/python/kdoc/c_lex.py +++ b/tools/lib/python/kdoc/c_lex.py @@ -58,14 +58,13 @@ class CToken(): =20 return CToken.MISMATCH =20 + def __init__(self, kind, value=3DNone, pos=3D0, brace_level=3D0, paren_level=3D0, bracket_level=3D0): self.kind =3D kind self.value =3D value self.pos =3D pos - self.brace_level =3D brace_level - self.paren_level =3D paren_level - self.bracket_level =3D bracket_level + self.level =3D (bracket_level, paren_level, brace_level) =20 def __repr__(self): name =3D self.to_name(self.kind) @@ -74,8 +73,7 @@ class CToken(): else: value =3D self.value =20 - return f"CToken({name}, {value}, {self.pos}, " \ - f"{self.brace_level}, {self.paren_level}, {self.bracket_lev= el})" + return f"CToken(CToken.{name}, {value}, {self.pos}, {self.level})" =20 #: Tokens to parse C code. TOKEN_LIST =3D [ @@ -105,20 +103,30 @@ TOKEN_LIST =3D [ (CToken.ENUM, r"\benum\b"), (CToken.TYPEDEF, r"\bkinddef\b"), =20 - (CToken.NAME, r"[A-Za-z_][A-Za-z0-9_]*"), + (CToken.NAME, r"[A-Za-z_][A-Za-z0-9_]*"), =20 (CToken.SPACE, r"[\s]+"), =20 (CToken.MISMATCH,r"."), ] =20 +def fill_re_scanner(token_list): + """Ancillary routine to convert TOKEN_LIST into a finditer regex""" + re_tokens =3D [] + + for kind, pattern in token_list: + name =3D CToken.to_name(kind) + re_tokens.append(f"(?P<{name}>{pattern})") + + return KernRe("|".join(re_tokens), re.MULTILINE | re.DOTALL) + #: Handle C continuation lines. RE_CONT =3D KernRe(r"\\\n") =20 RE_COMMENT_START =3D KernRe(r'/\*\s*') =20 #: tokenizer regex. Will be filled at the first CTokenizer usage. 
-re_scanner =3D None +RE_SCANNER =3D fill_re_scanner(TOKEN_LIST) =20 class CTokenizer(): """ @@ -149,7 +157,7 @@ class CTokenizer(): paren_level =3D 0 bracket_level =3D 0 =20 - for match in re_scanner.finditer(source): + for match in RE_SCANNER.finditer(source): kind =3D CToken.from_name(match.lastgroup) pos =3D match.start() value =3D match.group() @@ -175,7 +183,7 @@ class CTokenizer(): yield CToken(kind, value, pos, brace_level, paren_level, bracket_level) =20 - def __init__(self, source): + def __init__(self, source=3DNone): """ Create a regular expression to handle TOKEN_LIST. =20 @@ -183,20 +191,18 @@ class CTokenizer(): (?P...) =20 in this particular case, it makes sense, as we can pick the name - when matching a code via re_scanner(). + when matching a code via RE_SCANNER. """ - global re_scanner - - if not re_scanner: - re_tokens =3D [] - - for kind, pattern in TOKEN_LIST: - name =3D CToken.to_name(kind) - re_tokens.append(f"(?P<{name}>{pattern})") - - re_scanner =3D KernRe("|".join(re_tokens), re.MULTILINE | re.D= OTALL) =20 self.tokens =3D [] + + if not source: + return + + if isinstance(source, list): + self.tokens =3D source + return + for tok in self._tokenize(source): self.tokens.append(tok) =20 @@ -237,3 +243,179 @@ class CTokenizer(): out +=3D str(tok.value) =20 return out + + +class CMatch: + """ + Finding nested delimiters is hard with regular expressions. It is + even harder on Python with its normal re module, as there are several + advanced regular expressions that are missing. + + This is the case of this pattern:: + + '\\bSTRUCT_GROUP(\\(((?:(?>[^)(]+)|(?1))*)\\))[^;]*;' + + which is used to properly match open/close parentheses of the + string search STRUCT_GROUP(), + + Add a class that counts pairs of delimiters, using it to match and + replace nested expressions. 
+ + The original approach was suggested by: + + https://stackoverflow.com/questions/5454322/python-how-to-match-ne= sted-parentheses-with-regex + + Although I re-implemented it to make it more generic and match 3 types + of delimiters. The logic checks if delimiters are paired. If not, it + will ignore the search string. + """ + + # TODO: make CMatch handle multiple match groups + # + # Right now, regular expressions to match it are defined only up to + # the start delimiter, e.g.: + # + # \bSTRUCT_GROUP\( + # + # is similar to: STRUCT_GROUP\((.*)\) + # except that the content inside the match group is delimiter-aligned. + # + # The content inside parentheses is converted into a single replace + # group (e.g. r`\0'). + # + # It would be nice to change such definition to support multiple + # match groups, allowing a regex equivalent to: + # + # FOO\((.*), (.*), (.*)\) + # + # it is probably easier to define it not as a regular expression, but + # with some lexical definition like: + # + # FOO(arg1, arg2, arg3) + + def __init__(self, regex): + self.regex =3D KernRe(regex) + + def _search(self, tokenizer): + """ + Finds paired blocks for a regex that ends with a delimiter. + + The suggestion of using finditer to match pairs came from: + https://stackoverflow.com/questions/5454322/python-how-to-match-ne= sted-parentheses-with-regex + but I ended using a different implementation to align all three ty= pes + of delimiters and seek for an initial regular expression. + + The algorithm seeks for open/close paired delimiters and places th= em + into a stack, yielding a start/stop position of each match when the + stack is zeroed. + + The algorithm should work fine for properly paired lines, but will + silently ignore end delimiters that precede a start delimiter. + This should be OK for kernel-doc parser, as unaligned delimiters + would cause compilation errors. So, we don't need to raise excepti= ons + to cover such issues. 
+ """ + + start =3D None + offset =3D -1 + started =3D False + + import sys + + stack =3D [] + + for i, tok in enumerate(tokenizer.tokens): + if start is None: + if tok.kind =3D=3D CToken.NAME and self.regex.match(tok.va= lue): + start =3D i + stack.append((start, tok.level)) + started =3D False + + continue + + if not started and tok.kind =3D=3D CToken.BEGIN: + started =3D True + continue + + if tok.kind =3D=3D CToken.END and tok.level =3D=3D stack[-1][1= ]: + start, level =3D stack.pop() + offset =3D i + + yield CTokenizer(tokenizer.tokens[start:offset + 1]) + start =3D None + + # + # If an END zeroing levels is not there, return remaining stuff + # This is meant to solve cases where the caller logic might be + # picking an incomplete block. + # + if start and offset < 0: + print("WARNING: can't find an end", file=3Dsys.stderr) + yield CTokenizer(tokenizer.tokens[start:]) + + def search(self, source): + """ + This is similar to re.search: + + It matches a regex that it is followed by a delimiter, + returning occurrences only if all delimiters are paired. + """ + + if isinstance(source, CTokenizer): + tokenizer =3D source + is_token =3D True + else: + tokenizer =3D CTokenizer(source) + is_token =3D False + + for new_tokenizer in self._search(tokenizer): + if is_token: + yield new_tokenizer + else: + yield str(new_tokenizer) + + def sub(self, sub, line, count=3D0): + """ + This is similar to re.sub: + + It matches a regex that it is followed by a delimiter, + replacing occurrences only if all delimiters are paired. + + if the sub argument contains:: + + r'\0' + + it will work just like re: it places there the matched paired data + with the delimiter stripped. + + If count is different than zero, it will replace at most count + items. 
+ """ + if isinstance(source, CTokenizer): + is_token =3D True + tokenizer =3D source + else: + is_token =3D False + tokenizer =3D CTokenizer(source) + + new_tokenizer =3D CTokenizer() + cur_pos =3D 0 + for start, end in self._search(tokenizer): + new_tokenizer.tokens +=3D tokenizer.tokens[cur_pos:start] +# new_tokenizer.tokens +=3D [sub_str] + + cur_pos =3D end + 1 + + if cur_pos: + new_tokenizer.tokens +=3D tokenizer.tokens[cur_pos:] + + print(new_tokenizer.tokens) + + return str(new_tokenizer) + + def __repr__(self): + """ + Returns a displayable version of the class init. + """ + + return f'CMatch("{self.regex.regex.pattern}")' diff --git a/tools/unittests/test_tokenizer.py b/tools/unittests/test_token= izer.py index efb1d1687811..3081f27a7786 100755 --- a/tools/unittests/test_tokenizer.py +++ b/tools/unittests/test_tokenizer.py @@ -30,8 +30,7 @@ def tokens_to_list(tokens): if tok.kind =3D=3D CToken.SPACE: continue =20 - tuples +=3D [(tok.kind, tok.value, - tok.brace_level, tok.paren_level, tok.bracket_level)] + tuples +=3D [(tok.kind, tok.value, tok.level)] =20 return tuples =20 --=20 2.52.0