From nobody Tue Apr 7 16:20:24 2026 Received: from smtp.kernel.org (aws-us-west-2-korg-mail-1.web.codeaurora.org [10.30.226.201]) (using TLSv1.2 with cipher ECDHE-RSA-AES256-GCM-SHA384 (256/256 bits)) (No client certificate requested) by smtp.subspace.kernel.org (Postfix) with ESMTPS id A9C0E3B9DA4; Thu, 12 Mar 2026 14:54:59 +0000 (UTC) Authentication-Results: smtp.subspace.kernel.org; arc=none smtp.client-ip=10.30.226.201 ARC-Seal: i=1; a=rsa-sha256; d=subspace.kernel.org; s=arc-20240116; t=1773327299; cv=none; b=mUs9tI1MV1RJEtTfpEtrBD+9QbaPE3eX2uYQAyUvaA7y+VMKwTQMsebGwGrxVWebQXu4G7t4YhyTWq879dansnRDVWzIqG1M97OyW2JpDpI+zod4VT9/EczpsddJtS4wh4OB8hUsiqdTtqWeUp2/LfHJ9N57XVnYhKNx/e20LQY= ARC-Message-Signature: i=1; a=rsa-sha256; d=subspace.kernel.org; s=arc-20240116; t=1773327299; c=relaxed/simple; bh=DOreXVDvucIGBkwwT2ilLuF9eQSdozf07HLDbrhdpG0=; h=From:To:Cc:Subject:Date:Message-ID:In-Reply-To:References: MIME-Version:Content-Type; b=sgA5V5l0IHrvBljp19KbAk5VUccDrVHA+H02ESU54gTZnd5+nPe7ojI7omWb1+9DR6BvZE/4yDINKaEzWBrPgMrK+rB5pPscqSBYUIHr3zJDpu+RM3YhjPPbCaIj1y/0EPA/e03b9iCE5ZaVu+XrUQr/f7YK+ARs/5YsoU86FKc= ARC-Authentication-Results: i=1; smtp.subspace.kernel.org; dkim=pass (2048-bit key) header.d=kernel.org header.i=@kernel.org header.b=acEbsA4A; arc=none smtp.client-ip=10.30.226.201 Authentication-Results: smtp.subspace.kernel.org; dkim=pass (2048-bit key) header.d=kernel.org header.i=@kernel.org header.b="acEbsA4A" Received: by smtp.kernel.org (Postfix) with ESMTPSA id 3B487C2BC86; Thu, 12 Mar 2026 14:54:59 +0000 (UTC) DKIM-Signature: v=1; a=rsa-sha256; c=relaxed/simple; d=kernel.org; s=k20201202; t=1773327299; bh=DOreXVDvucIGBkwwT2ilLuF9eQSdozf07HLDbrhdpG0=; h=From:To:Cc:Subject:Date:In-Reply-To:References:From; b=acEbsA4ADqePlM+In2i9lU1TFwupiPOzYZeITN2ACzHz2QhxAttkLog6ErKzN4Qwh faBy0vljf3LbzxWBlneyYcgWr8AskeiP3BoBD6xZUF4jMJbXHE8EdDrUuFeuulDwUC bFbqSnYktT1B9aNxf4NGyWpmfWWwdcHQlj2qNzJIBkUn238F3pYIANYZYSjaflYgcA 
OL2u0aI4Y07LukEpM3Jw6I2CZPJuGvjkCRRxiQbIdqL4ttjUfI8sxMvgJilqxl2H4L AJqcpsiY4iBUfaFfd1uiCbkw51VR5MkSHmYq+z/LezXmDNYrQNMnR5vEKJ/AFG5m+d njpL18fVKfXcw== Received: from mchehab by mail.kernel.org with local (Exim 4.99.1) (envelope-from ) id 1w0hRB-00000008xyZ-24rx; Thu, 12 Mar 2026 15:54:57 +0100 From: Mauro Carvalho Chehab To: Jonathan Corbet , Linux Doc Mailing List Cc: Mauro Carvalho Chehab , linux-hardening@vger.kernel.org, linux-kernel@vger.kernel.org Subject: [PATCH v2 11/28] docs: kdoc: create a CMatch to match nested C blocks Date: Thu, 12 Mar 2026 15:54:31 +0100 Message-ID: <06d21e2c38a313aec8f7c8a6df4674c41c47c23b.1773326442.git.mchehab+huawei@kernel.org> X-Mailer: git-send-email 2.52.0 In-Reply-To: References: Precedence: bulk X-Mailing-List: linux-kernel@vger.kernel.org List-Id: List-Subscribe: List-Unsubscribe: MIME-Version: 1.0 Content-Type: text/plain; charset="utf-8" Content-Transfer-Encoding: quoted-printable Sender: Mauro Carvalho Chehab The NestedMatch code is complex, and will become even more complex if we add support there for arguments. Now that we have a tokenizer, we can use a better solution that is easier to understand. Yet, to improve performance, it is better to make it use previously tokenized code, changing its API. So, reimplement NestedMatch using the CTokenizer class. Once that is done, we can drop NestedMatch.
Signed-off-by: Mauro Carvalho Chehab --- tools/lib/python/kdoc/c_lex.py | 222 +++++++++++++++++++++++++++--- tools/unittests/test_tokenizer.py | 3 +- 2 files changed, 203 insertions(+), 22 deletions(-) diff --git a/tools/lib/python/kdoc/c_lex.py b/tools/lib/python/kdoc/c_lex.py index 38f70e836eb8..e986a4ad73e3 100644 --- a/tools/lib/python/kdoc/c_lex.py +++ b/tools/lib/python/kdoc/c_lex.py @@ -58,14 +58,13 @@ class CToken(): =20 return CToken.MISMATCH =20 + def __init__(self, kind, value=3DNone, pos=3D0, brace_level=3D0, paren_level=3D0, bracket_level=3D0): self.kind =3D kind self.value =3D value self.pos =3D pos - self.brace_level =3D brace_level - self.paren_level =3D paren_level - self.bracket_level =3D bracket_level + self.level =3D (bracket_level, paren_level, brace_level) =20 def __repr__(self): name =3D self.to_name(self.kind) @@ -74,8 +73,7 @@ class CToken(): else: value =3D self.value =20 - return f"CToken({name}, {value}, {self.pos}, " \ - f"{self.brace_level}, {self.paren_level}, {self.bracket_lev= el})" + return f"CToken(CToken.{name}, {value}, {self.pos}, {self.level})" =20 #: Tokens to parse C code. TOKEN_LIST =3D [ @@ -105,20 +103,30 @@ TOKEN_LIST =3D [ (CToken.ENUM, r"\benum\b"), (CToken.TYPEDEF, r"\bkinddef\b"), =20 - (CToken.NAME, r"[A-Za-z_][A-Za-z0-9_]*"), + (CToken.NAME, r"[A-Za-z_][A-Za-z0-9_]*"), =20 (CToken.SPACE, r"[\s]+"), =20 (CToken.MISMATCH,r"."), ] =20 +def fill_re_scanner(token_list): + """Ancillary routine to convert TOKEN_LIST into a finditer regex""" + re_tokens =3D [] + + for kind, pattern in token_list: + name =3D CToken.to_name(kind) + re_tokens.append(f"(?P<{name}>{pattern})") + + return KernRe("|".join(re_tokens), re.MULTILINE | re.DOTALL) + #: Handle C continuation lines. RE_CONT =3D KernRe(r"\\\n") =20 RE_COMMENT_START =3D KernRe(r'/\*\s*') =20 #: tokenizer regex. Will be filled at the first CTokenizer usage. 
-re_scanner =3D None +RE_SCANNER =3D fill_re_scanner(TOKEN_LIST) =20 class CTokenizer(): """ @@ -149,7 +157,7 @@ class CTokenizer(): paren_level =3D 0 bracket_level =3D 0 =20 - for match in re_scanner.finditer(source): + for match in RE_SCANNER.finditer(source): kind =3D CToken.from_name(match.lastgroup) pos =3D match.start() value =3D match.group() @@ -175,7 +183,7 @@ class CTokenizer(): yield CToken(kind, value, pos, brace_level, paren_level, bracket_level) =20 - def __init__(self, source): + def __init__(self, source=3DNone): """ Create a regular expression to handle TOKEN_LIST. =20 @@ -183,20 +191,18 @@ class CTokenizer(): (?P...) =20 in this particular case, it makes sense, as we can pick the name - when matching a code via re_scanner(). + when matching a code via RE_SCANNER. """ - global re_scanner - - if not re_scanner: - re_tokens =3D [] - - for kind, pattern in TOKEN_LIST: - name =3D CToken.to_name(kind) - re_tokens.append(f"(?P<{name}>{pattern})") - - re_scanner =3D KernRe("|".join(re_tokens), re.MULTILINE | re.D= OTALL) =20 self.tokens =3D [] + + if not source: + return + + if isinstance(source, list): + self.tokens =3D source + return + for tok in self._tokenize(source): self.tokens.append(tok) =20 @@ -237,3 +243,179 @@ class CTokenizer(): out +=3D str(tok.value) =20 return out + + +class CMatch: + """ + Finding nested delimiters is hard with regular expressions. It is + even harder on Python with its normal re module, as there are several + advanced regular expressions that are missing. + + This is the case of this pattern:: + + '\\bSTRUCT_GROUP(\\(((?:(?>[^)(]+)|(?1))*)\\))[^;]*;' + + which is used to properly match open/close parentheses of the + string search STRUCT_GROUP(), + + Add a class that counts pairs of delimiters, using it to match and + replace nested expressions. 
+ + The original approach was suggested by: + + https://stackoverflow.com/questions/5454322/python-how-to-match-ne= sted-parentheses-with-regex + + Although I re-implemented it to make it more generic and match 3 types + of delimiters. The logic checks if delimiters are paired. If not, it + will ignore the search string. + """ + + # TODO: make CMatch handle multiple match groups + # + # Right now, regular expressions to match it are defined only up to + # the start delimiter, e.g.: + # + # \bSTRUCT_GROUP\( + # + # is similar to: STRUCT_GROUP\((.*)\) + # except that the content inside the match group is delimiter-aligned. + # + # The content inside parentheses is converted into a single replace + # group (e.g. r`\0'). + # + # It would be nice to change such definition to support multiple + # match groups, allowing a regex equivalent to: + # + # FOO\((.*), (.*), (.*)\) + # + # it is probably easier to define it not as a regular expression, but + # with some lexical definition like: + # + # FOO(arg1, arg2, arg3) + + def __init__(self, regex): + self.regex =3D KernRe(regex) + + def _search(self, tokenizer): + """ + Finds paired blocks for a regex that ends with a delimiter. + + The suggestion of using finditer to match pairs came from: + https://stackoverflow.com/questions/5454322/python-how-to-match-ne= sted-parentheses-with-regex + but I ended using a different implementation to align all three ty= pes + of delimiters and seek for an initial regular expression. + + The algorithm seeks for open/close paired delimiters and places th= em + into a stack, yielding a start/stop position of each match when the + stack is zeroed. + + The algorithm should work fine for properly paired lines, but will + silently ignore end delimiters that precede a start delimiter. + This should be OK for kernel-doc parser, as unaligned delimiters + would cause compilation errors. So, we don't need to raise excepti= ons + to cover such issues. 
+ """ + + start =3D None + offset =3D -1 + started =3D False + + import sys + + stack =3D [] + + for i, tok in enumerate(tokenizer.tokens): + if start is None: + if tok.kind =3D=3D CToken.NAME and self.regex.match(tok.va= lue): + start =3D i + stack.append((start, tok.level)) + started =3D False + + continue + + if not started and tok.kind =3D=3D CToken.BEGIN: + started =3D True + continue + + if tok.kind =3D=3D CToken.END and tok.level =3D=3D stack[-1][1= ]: + start, level =3D stack.pop() + offset =3D i + + yield CTokenizer(tokenizer.tokens[start:offset + 1]) + start =3D None + + # + # If an END zeroing levels is not there, return remaining stuff + # This is meant to solve cases where the caller logic might be + # picking an incomplete block. + # + if start and offset < 0: + print("WARNING: can't find an end", file=3Dsys.stderr) + yield CTokenizer(tokenizer.tokens[start:]) + + def search(self, source): + """ + This is similar to re.search: + + It matches a regex that it is followed by a delimiter, + returning occurrences only if all delimiters are paired. + """ + + if isinstance(source, CTokenizer): + tokenizer =3D source + is_token =3D True + else: + tokenizer =3D CTokenizer(source) + is_token =3D False + + for new_tokenizer in self._search(tokenizer): + if is_token: + yield new_tokenizer + else: + yield str(new_tokenizer) + + def sub(self, sub, line, count=3D0): + """ + This is similar to re.sub: + + It matches a regex that it is followed by a delimiter, + replacing occurrences only if all delimiters are paired. + + if the sub argument contains:: + + r'\0' + + it will work just like re: it places there the matched paired data + with the delimiter stripped. + + If count is different than zero, it will replace at most count + items. 
+ """ + if isinstance(source, CTokenizer): + is_token =3D True + tokenizer =3D source + else: + is_token =3D False + tokenizer =3D CTokenizer(source) + + new_tokenizer =3D CTokenizer() + cur_pos =3D 0 + for start, end in self._search(tokenizer): + new_tokenizer.tokens +=3D tokenizer.tokens[cur_pos:start] +# new_tokenizer.tokens +=3D [sub_str] + + cur_pos =3D end + 1 + + if cur_pos: + new_tokenizer.tokens +=3D tokenizer.tokens[cur_pos:] + + print(new_tokenizer.tokens) + + return str(new_tokenizer) + + def __repr__(self): + """ + Returns a displayable version of the class init. + """ + + return f'CMatch("{self.regex.regex.pattern}")' diff --git a/tools/unittests/test_tokenizer.py b/tools/unittests/test_token= izer.py index efb1d1687811..3081f27a7786 100755 --- a/tools/unittests/test_tokenizer.py +++ b/tools/unittests/test_tokenizer.py @@ -30,8 +30,7 @@ def tokens_to_list(tokens): if tok.kind =3D=3D CToken.SPACE: continue =20 - tuples +=3D [(tok.kind, tok.value, - tok.brace_level, tok.paren_level, tok.bracket_level)] + tuples +=3D [(tok.kind, tok.value, tok.level)] =20 return tuples =20 --=20 2.52.0