From nobody Tue Apr 7 16:17:37 2026 Received: from smtp.kernel.org (aws-us-west-2-korg-mail-1.web.codeaurora.org [10.30.226.201]) (using TLSv1.2 with cipher ECDHE-RSA-AES256-GCM-SHA384 (256/256 bits)) (No client certificate requested) by smtp.subspace.kernel.org (Postfix) with ESMTPS id DFBE73BE650; Thu, 12 Mar 2026 14:54:59 +0000 (UTC) Authentication-Results: smtp.subspace.kernel.org; arc=none smtp.client-ip=10.30.226.201 ARC-Seal: i=1; a=rsa-sha256; d=subspace.kernel.org; s=arc-20240116; t=1773327300; cv=none; b=nVFZZF34PxZ1PrOidUpnpic730veLd9VJv1tFZgEU5fVWiUqHYA/+t9l1ZiMXD7Tgef2LE+ilhI7ud0evERpxjxr5Qxh2O4mXuIa07nsALm2qGT+/skB+JALbZXejWSrRbhmiFz19EhRXKDtMntTF0OlzeD590lmdako+pTSXDs= ARC-Message-Signature: i=1; a=rsa-sha256; d=subspace.kernel.org; s=arc-20240116; t=1773327300; c=relaxed/simple; bh=23osqksC+HlDZjjQOvGmLu9UHdH5zA5IUH035QkOdVQ=; h=From:To:Cc:Subject:Date:Message-ID:In-Reply-To:References: MIME-Version:Content-Type; b=X2MS4GeSyby7zVM/xSEk3KBmrar14LNZnEYtm6kiOBQ1MI6vBWtZjAWaO2DZNP4st6jDaVOCZeUFcF6wtFHmXcULLSILlhDyo4shuu0WEUPLlsFLZB7ZvLlaaGoRijfypnL5IuV+yztIaUXvXvueINmz9N82Ph5H6NwWyMc8s28= ARC-Authentication-Results: i=1; smtp.subspace.kernel.org; dkim=pass (2048-bit key) header.d=kernel.org header.i=@kernel.org header.b=JC+hGW9p; arc=none smtp.client-ip=10.30.226.201 Authentication-Results: smtp.subspace.kernel.org; dkim=pass (2048-bit key) header.d=kernel.org header.i=@kernel.org header.b="JC+hGW9p" Received: by smtp.kernel.org (Postfix) with ESMTPSA id A073CC19424; Thu, 12 Mar 2026 14:54:59 +0000 (UTC) DKIM-Signature: v=1; a=rsa-sha256; c=relaxed/simple; d=kernel.org; s=k20201202; t=1773327299; bh=23osqksC+HlDZjjQOvGmLu9UHdH5zA5IUH035QkOdVQ=; h=From:To:Cc:Subject:Date:In-Reply-To:References:From; b=JC+hGW9pzNzwD20PFpPs0JX5fviNXqg2/grt7ZTBTdQi53KAEu8y1hoZPnvu6yjj7 nXXddCTiW0L21IBND/CidZkJpN0HIWAO74of4tcp2+2TbF1Akoa0vs7pWkUr+j+oUn SwN6d57bv9KQftH6KrX4KPHaY4ZxOmF7VXFhZtelsGYi0sNkeWJLreU1VaG66cF4m2 
b0iHwUfz0NwzqdoWBY/wJ+yK+YE8pyEH59Ft81XDFwwqJmhnSGI+k8PW0f5FJWEZcH biGm0eBi2W7YpNLTFe3ThBkxEhVMSkYC4Pkq8wHne4r6KKw74Qe9P1yaHXJyqsjdqK 6FxrlzLZPXtuw== Received: from mchehab by mail.kernel.org with local (Exim 4.99.1) (envelope-from ) id 1w0hRB-00000008y1P-3luq; Thu, 12 Mar 2026 15:54:57 +0100 From: Mauro Carvalho Chehab To: Jonathan Corbet , Linux Doc Mailing List Cc: Mauro Carvalho Chehab , linux-hardening@vger.kernel.org, linux-kernel@vger.kernel.org Subject: [PATCH v2 13/28] docs: c_lex: properly implement a sub() method for CMatch Date: Thu, 12 Mar 2026 15:54:33 +0100 Message-ID: <4c88a55b21910b1bae6838d74c8ab6b32e5c8213.1773326442.git.mchehab+huawei@kernel.org> X-Mailer: git-send-email 2.52.0 In-Reply-To: References: Precedence: bulk X-Mailing-List: linux-kernel@vger.kernel.org List-Id: List-Subscribe: List-Unsubscribe: MIME-Version: 1.0 Content-Type: text/plain; charset="utf-8" Content-Transfer-Encoding: quoted-printable Sender: Mauro Carvalho Chehab Change the sub() method to do what is expected of it, parsing backref arguments like \0, \1, \2, ... Signed-off-by: Mauro Carvalho Chehab --- tools/lib/python/kdoc/c_lex.py | 240 +++++++++++++++++++++++++++------ 1 file changed, 202 insertions(+), 38 deletions(-) diff --git a/tools/lib/python/kdoc/c_lex.py b/tools/lib/python/kdoc/c_lex.py index e986a4ad73e3..98031cb7907c 100644 --- a/tools/lib/python/kdoc/c_lex.py +++ b/tools/lib/python/kdoc/c_lex.py @@ -10,6 +10,8 @@ Those help caching regular expressions and do matching fo= r kernel-doc. =20 import re =20 +from copy import copy + from .kdoc_re import KernRe =20 class CToken(): @@ -36,6 +38,8 @@ class CToken(): NAME =3D 14 #: A name. Can be an ID or a type. SPACE =3D 15 #: Any space characters, including new lines =20 + BACKREF =3D 16 #: Not a valid C sequence, but used at sub regex patte= rns. + MISMATCH =3D 255 #: an error indicator: should never happen in practi= ce. =20 # Dict to convert from an enum interger into a string. 
@@ -107,6 +111,8 @@ TOKEN_LIST =3D [ =20 (CToken.SPACE, r"[\s]+"), =20 + (CToken.BACKREF, r"\\\d+"), + (CToken.MISMATCH,r"."), ] =20 @@ -245,6 +251,167 @@ class CTokenizer(): return out =20 =20 +class CTokenArgs: + """ + Ancillary class to help using backrefs from sub matches. + + If the highest backref contain a "+" at the last element, + the logic will be greedy, picking all other delims. + + This is needed to parse struct_group macros with end with ``MEMBERS...= ``. + """ + def __init__(self, sub_str): + self.sub_groups =3D set() + self.max_group =3D -1 + self.greedy =3D None + + for m in KernRe(r'\\(\d+)([+]?)').finditer(sub_str): + group =3D int(m.group(1)) + if m.group(2) =3D=3D "+": + if self.greedy and self.greedy !=3D group: + raise ValueError("There are multiple greedy patterns!") + self.greedy =3D group + + self.sub_groups.add(group) + self.max_group =3D max(self.max_group, group) + + if self.greedy: + if self.greedy !=3D self.max_group: + raise ValueError("Greedy pattern is not the last one!") + + sub_str =3D KernRe(r'(\\\d+)[+]').sub(r"\1", sub_str) + + self.sub_str =3D sub_str + self.sub_tokeninzer =3D CTokenizer(sub_str) + + def groups(self, new_tokenizer): + """ + Create replacement arguments for backrefs like: + + ``\0``, ``\1``, ``\2``, ...``\n`` + + It also accepts a ``+`` character to the highest backref. When use= d, + it means in practice to ignore delimins after it, being greedy. + + The logic is smart enough to only go up to the maximum required + argument, even if there are more. + + If there is a backref for an argument above the limit, it will + raise an exception. Please notice that, on C, square brackets + don't have any separator on it. Trying to use ``\1``..``\n`` for + brackets also raise an exception. 
+ """ + + level =3D (0, 0, 0) + + if self.max_group < 0: + return level, [] + + tokens =3D new_tokenizer.tokens + + # + # Fill \0 with the full token contents + # + groups_list =3D [ [] ] + + if 0 in self.sub_groups: + inner_level =3D 0 + + for i in range(0, len(tokens)): + tok =3D tokens[i] + + if tok.kind =3D=3D CToken.BEGIN: + inner_level +=3D 1 + continue + + if tok.kind =3D=3D CToken.END: + inner_level -=3D 1 + if inner_level < 0: + break + + if inner_level: + groups_list[0].append(tok) + + if not self.max_group: + return level, groups_list + + delim =3D None + + # + # Ignore everything before BEGIN. The value of begin gives the + # delimiter to be used for the matches + # + for i in range(0, len(tokens)): + tok =3D tokens[i] + if tok.kind =3D=3D CToken.BEGIN: + if tok.value =3D=3D "{": + delim =3D ";" + elif tok.value =3D=3D "(": + delim =3D "," + else: + raise ValueError(fr"Can't handle \1..\n on {sub_str}") + + level =3D tok.level + break + + pos =3D 1 + groups_list.append([]) + + inner_level =3D 0 + for i in range(i + 1, len(tokens)): + tok =3D tokens[i] + + if tok.kind =3D=3D CToken.BEGIN: + inner_level +=3D 1 + if tok.kind =3D=3D CToken.END: + inner_level -=3D 1 + if inner_level < 0: + break + + if tok.kind =3D=3D CToken.PUNC and delim =3D=3D tok.value: + pos +=3D 1 + if self.greedy and pos > self.max_group: + pos -=3D 1 + else: + groups_list.append([]) + + if pos > self.max_group: + break + + continue + + groups_list[pos].append(tok) + + if pos < self.max_group: + raise ValueError(fr"{self.sub_str} groups are up to {pos} inst= ead of {self.max_group}") + + return level, groups_list + + def tokens(self, new_tokenizer): + level, groups =3D self.groups(new_tokenizer) + + new =3D CTokenizer() + + for tok in self.sub_tokeninzer.tokens: + if tok.kind =3D=3D CToken.BACKREF: + group =3D int(tok.value[1:]) + + for group_tok in groups[group]: + new_tok =3D copy(group_tok) + + new_level =3D [0, 0, 0] + + for i in range(0, len(level)): + new_level[i] =3D 
new_tok.level[i] + level[i] + + new_tok.level =3D tuple(new_level) + + new.tokens +=3D [ new_tok ] + else: + new.tokens +=3D [ tok ] + + return new.tokens + class CMatch: """ Finding nested delimiters is hard with regular expressions. It is @@ -270,31 +437,9 @@ class CMatch: will ignore the search string. """ =20 - # TODO: make CMatch handle multiple match groups - # - # Right now, regular expressions to match it are defined only up to - # the start delimiter, e.g.: - # - # \bSTRUCT_GROUP\( - # - # is similar to: STRUCT_GROUP\((.*)\) - # except that the content inside the match group is delimiter-aligned. - # - # The content inside parentheses is converted into a single replace - # group (e.g. r`\0'). - # - # It would be nice to change such definition to support multiple - # match groups, allowing a regex equivalent to: - # - # FOO\((.*), (.*), (.*)\) - # - # it is probably easier to define it not as a regular expression, but - # with some lexical definition like: - # - # FOO(arg1, arg2, arg3) =20 def __init__(self, regex): - self.regex =3D KernRe(regex) + self.regex =3D KernRe("^" + regex + r"\b") =20 def _search(self, tokenizer): """ @@ -317,7 +462,6 @@ class CMatch: """ =20 start =3D None - offset =3D -1 started =3D False =20 import sys @@ -339,9 +483,8 @@ class CMatch: =20 if tok.kind =3D=3D CToken.END and tok.level =3D=3D stack[-1][1= ]: start, level =3D stack.pop() - offset =3D i =20 - yield CTokenizer(tokenizer.tokens[start:offset + 1]) + yield start, i start =3D None =20 # @@ -349,9 +492,9 @@ class CMatch: # This is meant to solve cases where the caller logic might be # picking an incomplete block. 
# - if start and offset < 0: + if start and stack: print("WARNING: can't find an end", file=3Dsys.stderr) - yield CTokenizer(tokenizer.tokens[start:]) + yield start, len(tokenizer.tokens) =20 def search(self, source): """ @@ -368,13 +511,15 @@ class CMatch: tokenizer =3D CTokenizer(source) is_token =3D False =20 - for new_tokenizer in self._search(tokenizer): + for start, end in self._search(tokenizer): + new_tokenizer =3D CTokenizer(tokenizer.tokens[start:end + 1]) + if is_token: yield new_tokenizer else: yield str(new_tokenizer) =20 - def sub(self, sub, line, count=3D0): + def sub(self, sub_str, source, count=3D0): """ This is similar to re.sub: =20 @@ -398,20 +543,39 @@ class CMatch: is_token =3D False tokenizer =3D CTokenizer(source) =20 + # Detect if sub_str contains sub arguments + + args_match =3D CTokenArgs(sub_str) + new_tokenizer =3D CTokenizer() - cur_pos =3D 0 + pos =3D 0 + n =3D 0 + + # + # NOTE: the code below doesn't consider overlays at sub. + # We may need to add some extra unit tests to check if those + # would cause problems. When replacing by "", this should not + # be a problem, but other transformations could be problematic + # for start, end in self._search(tokenizer): - new_tokenizer.tokens +=3D tokenizer.tokens[cur_pos:start] -# new_tokenizer.tokens +=3D [sub_str] + new_tokenizer.tokens +=3D tokenizer.tokens[pos:start] =20 - cur_pos =3D end + 1 + new =3D CTokenizer(tokenizer.tokens[start:end + 1]) =20 - if cur_pos: - new_tokenizer.tokens +=3D tokenizer.tokens[cur_pos:] + new_tokenizer.tokens +=3D args_match.tokens(new) =20 - print(new_tokenizer.tokens) + pos =3D end + 1 =20 - return str(new_tokenizer) + n +=3D 1 + if count and n >=3D count: + break + + new_tokenizer.tokens +=3D tokenizer.tokens[pos:] + + if not is_token: + return str(new_tokenizer) + + return new_tokenizer =20 def __repr__(self): """ --=20 2.52.0