From nobody Sat Oct 4 14:33:03 2025 Received: from smtpbgau1.qq.com (smtpbgau1.qq.com [54.206.16.166]) (using TLSv1.2 with cipher ECDHE-RSA-AES256-GCM-SHA384 (256/256 bits)) (No client certificate requested) by smtp.subspace.kernel.org (Postfix) with ESMTPS id 9A60442056 for ; Fri, 15 Aug 2025 07:55:50 +0000 (UTC) Authentication-Results: smtp.subspace.kernel.org; arc=none smtp.client-ip=54.206.16.166 ARC-Seal: i=1; a=rsa-sha256; d=subspace.kernel.org; s=arc-20240116; t=1755244556; cv=none; b=Vt0Jns1XlmIaArvcW/wSzCUdkNP+WDhGBYIwn/InBJiPC41imKklAlE0xx6XwhfGy3aWRqwn0sr9caPvtN4TqMoQjiLbieVzT5RGGggI7zP9W5ekUlLEc+J8BwaMr1z5kMlKRbNBSUnaxAEn/CCIaD7kKhnhULBcirZcF0hy0i0= ARC-Message-Signature: i=1; a=rsa-sha256; d=subspace.kernel.org; s=arc-20240116; t=1755244556; c=relaxed/simple; bh=FKp9qrGyxPb1wOmh93bDMiSLybMwM1BJkxyNYFEjBeY=; h=From:To:Cc:Subject:Date:Message-Id:MIME-Version:Content-Type; b=nzMDMAemBF7KdUMvzuVfFd0FUtPVCKwg1rs1fBdTZ4HM8TyosMCauCsH70H7WK/03fe/qQGsd00Y3cmNBs3iexaNTXRqQ5ucwdCrNptPg1fKLxslXlNSiIOOzbySZDPKTXWAtADXPmxQlTYjkSwzq8QLoZMy0eU/5iSskWdUMNM= ARC-Authentication-Results: i=1; smtp.subspace.kernel.org; dmarc=pass (p=none dis=none) header.from=uniontech.com; spf=pass smtp.mailfrom=uniontech.com; dkim=pass (1024-bit key) header.d=uniontech.com header.i=@uniontech.com header.b=ouF1keK8; arc=none smtp.client-ip=54.206.16.166 Authentication-Results: smtp.subspace.kernel.org; dmarc=pass (p=none dis=none) header.from=uniontech.com Authentication-Results: smtp.subspace.kernel.org; spf=pass smtp.mailfrom=uniontech.com Authentication-Results: smtp.subspace.kernel.org; dkim=pass (1024-bit key) header.d=uniontech.com header.i=@uniontech.com header.b="ouF1keK8" DKIM-Signature: v=1; a=rsa-sha256; c=relaxed/relaxed; d=uniontech.com; s=onoh2408; t=1755244521; bh=8AjUIk/Zqh55PDDXuVmj0hJvuhDpc6oxkZ4dfmxlBmY=; h=From:To:Subject:Date:Message-Id:MIME-Version; b=ouF1keK8rbO9jv5Gq1DQdrtvEt72Pff7glk+oZ5Hx3sS1OJ4m4VasbQvVSwjTrOcF 1/wjc5753oud80td8D61rCpTfbdFxId6y34E/x+RppBaQV6Bdq41WfmOCM5GNN2qdT mUWtbcz26M4teP/3dtg7EYWTofMpFcX5R2Uz34KE= X-QQ-mid: zesmtpip2t1755244478t1e0948ec X-QQ-Originating-IP: Cndmvw05UFEIzX8SCpYCEcyesNxf1IA6K4fsu7lRJvE= Received: from localhost.localdomain ( [localhost]) by bizesmtp.qq.com (ESMTP) with id ; Fri, 15 Aug 2025 15:54:37 +0800 (CST) X-QQ-SSF: 0000000000000000000000000000000 X-QQ-GoodBg: 1 X-BIZMAIL-ID: 8067457830094247552 EX-QQ-RecipientCnt: 9 From: Morduang Zang To: apw@canonical.com, joe@perches.com, dwaipayanray1@gmail.com, lukas.bulwahn@gmail.com Cc: linux-kernel@vger.kernel.org, wangyuli@uniontech.com, zhanjun@uniontech.com, niecheng1@uniontech.com, Morduang Zang Subject: [PATCH RESEND] Add comprehensive detection and automatic fixing capability for full-width (Unicode) characters that are commonly mistaken for ASCII punctuation marks. This helps catch input method editor artifacts that can cause compilation errors or formatting issues. Date: Fri, 15 Aug 2025 15:54:30 +0800 Message-Id: <20250815075430.135527-1-zhangdandan@uniontech.com> X-Mailer: git-send-email 2.20.1 Precedence: bulk X-Mailing-List: linux-kernel@vger.kernel.org List-Id: List-Subscribe: List-Unsubscribe: MIME-Version: 1.0 Content-Type: text/plain; charset="utf-8" Content-Transfer-Encoding: quoted-printable X-QQ-SENDSIZE: 520 Feedback-ID: zesmtpip:uniontech.com:qybglogicsvrgz:qybglogicsvrgz6b-0 X-QQ-XMAILINFO: NzSDQCFWCzVTUyidjX0TOekERRNf8zpTRvzbrdn5mOqZCNSQrIjUJ4+F P9TgbALhjgOL4ogNHDB933CmQazLPipq9CEUJ1rf4+FuXfWY7SR/gTLIVx1xdeQjcI6jSHM tcqnzcWCpEWUgaZNpyawO7azfXoZ0khx/6KGYidd5tG2BE3j07adhbJlY/CJQsU130Nw5aW UgqRm5C0k8SKOoKtS3qdPvz54lKes+JsLr32qeTDiL89fbFiWq28+Jfsk6QOzZ1847EdHq9 u6TfYTeji2v09NReGe8zzzb+ndijq1zD18TLZBkdUEHC3GvKlc4z6yUrmcNZszQ/bsBB0pY WOGpxNbaAjFUiYo+i0bnKTzt54CDOlVl/9pm3NzZAwIB+JtszDvR2PWehqiPk2xyVWY93UB f9pF0k2k5eZFNitw4ftoPeKcP2rSIPcS9+q2AERFMb/0kXtWxpe8stYN4wn0+Ccf8PJL/P8 abBlgGnr3995acUbxWz1+MAU4QY1VIy2RacHBeGxUiAMCYbzfv882X53tEOuAfk7ELEI7Xb B3Ze2f2ANQcbskFvkiJv6u0nEZl3/iqNIPUnON10OR9J2+j2F6yVYQ7kUQeeUqwP9zEZ6qG HCCmuIO92pYtCp6pMAwyf/CUMZaEjED94THPHwu3vVUMBuVscomb94qrtY+vnVBrYN4pT4a nmWK3DI72I6srpIp2OfO40mq+RYi0ERdfgW142MChhW1oiQyPiRsQsd76qCaTXCjNyRHNdZ Rdv4ur+FdQzfAwRw4+KqPvt5moRjsYcw5yCVi0wrtpDW4I0eZGv9grn8+2ZASoQ3tcyRlhT eKywlDB9s4nZRtMNfwL3PmPiqyoFOe5OIZxEiBCrc7hJJYnbGOQJt3094qOICXxcgCLyV0l 7R8BvvCV7x9buQVD5UJGuLffiCfBK9gpJ4HzVZiLgDKrNTx0EaslT4+eCW/BXxmOWJtR2AD Q+gkAYRF0+pknGyWp99UhUnL2aekYuLLum6P2BU6IRz/7i/Ry5vlyEa/pb5zEQbKZLSMAsW qX4lAaIhJSEQ7MjHX/TAi4kkKkZUmzF2lF9qQdFD3tVGOKntuE X-QQ-XMRINFO: NyFYKkN4Ny6FSmKK/uo/jdU= X-QQ-RECHKSPAM: 0 The implementation detects 25 types of full-width characters: - Basic punctuation: =EF=BC=9B=EF=BC=8C=E3=80=82=EF=BC=88=EF=BC=89=EF=BC=81= =EF=BC=9F=EF=BC=9A=E3=80=80 - Programming brackets: =EF=BC=BB=EF=BC=BD=EF=BD=9B=EF=BD=9D=EF=BC=9C=EF=BC= =9E - Assignment and comparison: =EF=BC=9D - Arithmetic operators: =EF=BC=8B=EF=BC=8D=EF=BC=8A=EF=BC=8F=EF=BC=BC - Other programming symbols: =EF=BC=85=EF=BC=83=EF=BC=86=EF=BD=9C Detection covers three areas: 1. Code lines (lines starting with '+') - FULLWIDTH_CHARS 2. Commit messages - FULLWIDTH_CHARS_COMMIT 3. Subject lines - FULLWIDTH_CHARS_SUBJECT Example usage: ./scripts/checkpatch.pl my_patch.patch ./scripts/checkpatch.pl --fix my_patch.patch ./scripts/checkpatch.pl --fix-inplace my_source.c Signed-off-by: Morduang Zang Signed-off-by: Wangyuli --- scripts/checkpatch.pl | 84 +++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 84 insertions(+) diff --git a/scripts/checkpatch.pl b/scripts/checkpatch.pl index e722dd6fa8ef..f4cb547a470b 100755 --- a/scripts/checkpatch.pl +++ b/scripts/checkpatch.pl @@ -75,6 +75,41 @@ my $git_command =3D'export LANGUAGE=3Den_US.UTF-8; git'; my $tabsize =3D 8; my ${CONFIG_} =3D "CONFIG_"; =20 +# Full-width character mappings (UTF-8 byte sequences to ASCII) +my %fullwidth_chars =3D ( + # Basic punctuation + "\xef\xbc\x9b" =3D> [";", "semicolon", "=EF=BC=9B"], + "\xef\xbc\x8c" =3D> [",", "comma", "=EF=BC=8C"], + "\xe3\x80\x82" =3D> [".", "period", "=E3=80=82"], + "\xef\xbc\x88" =3D> ["(", "opening parenthesis", "=EF=BC=88"], + "\xef\xbc\x89" =3D> [")", "closing parenthesis", "=EF=BC=89"], + "\xef\xbc\x81" =3D> ["!", "exclamation mark", "=EF=BC=81"], + "\xef\xbc\x9f" =3D> ["?", "question mark", "=EF=BC=9F"], + "\xef\xbc\x9a" =3D> [":", "colon", "=EF=BC=9A"], + "\xe3\x80\x80" =3D> [" ", "space", "=E3=80=80"], + # Programming brackets + "\xef\xbc\xbb" =3D> ["[", "left square bracket", "=EF=BC=BB"], + "\xef\xbc\xbd" =3D> ["]", "right square bracket", "=EF=BC=BD"], + "\xef\xbd\x9b" =3D> ["{", "left curly bracket", "=EF=BD=9B"], + "\xef\xbd\x9d" =3D> ["}", "right curly bracket", "=EF=BD=9D"], + "\xef\xbc\x9c" =3D> ["<", "less-than sign", "=EF=BC=9C"], + "\xef\xbc\x9e" =3D> [">", "greater-than sign", "=EF=BC=9E"], + # Assignment and comparison + "\xef\xbc\x9d" =3D> ["=3D", "equals sign", "=EF=BC=9D"], + # Arithmetic operators + "\xef\xbc\x8b" =3D> ["+", "plus sign", "=EF=BC=8B"], + "\xef\xbc\x8d" =3D> ["-", "minus sign", "=EF=BC=8D"], + "\xef\xbc\x8a" =3D> ["*", "asterisk", "=EF=BC=8A"], + "\xef\xbc\x8f" =3D> ["/", "solidus", "=EF=BC=8F"], + "\xef\xbc\xbc" =3D> ["\\", "reverse solidus", "=EF=BC=BC"], + # Other programming symbols + "\xef\xbc\x85" =3D> ["%", "percent sign", "=EF=BC=85"], + "\xef\xbc\x83" =3D> ["#", "number sign", "=EF=BC=83"], + "\xef\xbc\x86" =3D> ["&", "ampersand", "=EF=BC=86"], + "\xef\xbd\x9c" =3D> ["|", "vertical line", "=EF=BD=9C"], +); +my $fullwidth_pattern =3D join('|', map { quotemeta($_) } keys %fullwidth_= chars); + my %maybe_linker_symbol; # for externs in c exceptions, when seen in *vmli= nux.lds.h =20 sub help { @@ -1019,6 +1054,40 @@ sub read_words { return 0; } =20 +# Check for full-width characters and optionally fix them +sub check_fullwidth_chars { + my ($line, $context, $warning_type, $apply_fix, $fixlinenr, $fixed_ref, $= herecurr) =3D @_; + my @found_chars =3D (); + my $fixed_line =3D $line; + my $has_fixes =3D 0; + + return 0 unless $line =3D~ /$fullwidth_pattern/o; + + if ($apply_fix) { + $fixed_line =3D~ s/($fullwidth_pattern)/$fullwidth_chars{$1}[0]/ge; + $has_fixes =3D ($fixed_line ne $line); + } + + while ($line =3D~ /($fullwidth_pattern)/go) { + my $fullwidth_byte_seq =3D $1; + if (exists $fullwidth_chars{$fullwidth_byte_seq}) { + my ($ascii_char, $name, $fullwidth_char) =3D @{$fullwidth_chars{$fullwi= dth_byte_seq}}; + push @found_chars, "Full-width $name ($fullwidth_char) found$context, u= se ASCII $name ($ascii_char) instead"; + } + } + + if (@found_chars) { + foreach my $msg (@found_chars) { + WARN($warning_type, $msg . "\n" . $herecurr); + } + if ($apply_fix && $has_fixes && defined $fixed_ref) { + $fixed_ref->[$fixlinenr] =3D $fixed_line; + } + } + + return scalar @found_chars; +} + my $const_structs; if (show_type("CONST_STRUCT")) { read_words(\$const_structs, $conststructsfile) @@ -2961,6 +3030,11 @@ sub process { $commit_log_has_diff =3D 1; } =20 +# Check for full-width characters in commit message + if ($in_commit_log && show_type("FULLWIDTH_CHARS_COMMIT")) { + check_fullwidth_chars($rawline, " in commit message", "FULLWIDTH_CHARS_= COMMIT", 0, 0, undef, $herecurr); + } + # Check for incorrect file permissions if ($line =3D~ /^new (file )?mode.*[7531]\d{0,2}$/) { my $permhere =3D $here . "FILE: $realfile\n"; @@ -3266,6 +3340,11 @@ sub process { "A patch subject line should describe the change not the tool that= found it\n" . $herecurr); } =20 +# Check for full-width characters in Subject line + if ($in_header_lines && $line =3D~ /^Subject:/i && show_type("FULLWIDTH_= CHARS_SUBJECT")) { + check_fullwidth_chars($rawline, " in subject line", "FULLWIDTH_CHARS_SU= BJECT", 0, 0, undef, $herecurr); + } + # Check for Gerrit Change-Ids not in any patch context if ($realfile eq '' && !$has_patch_separator && $line =3D~ /^\s*change-i= d:/i) { if (ERROR("GERRIT_CHANGE_ID", @@ -3974,6 +4053,11 @@ sub process { } } =20 +# check for full-width characters (full-width punctuation marks, etc.) + if ($rawline =3D~ /^\+/ && show_type("FULLWIDTH_CHARS")) { + check_fullwidth_chars($rawline, "", "FULLWIDTH_CHARS", $fix, $fixlinenr= , \@fixed, $herecurr); + } + # check multi-line statement indentation matches previous line if ($perl_version_ok && $prevline =3D~ /^\+([ \t]*)((?:$c90_Keywords(?:\s+if)\s*)|(?:$Declar= e\s*)?(?:$Ident|\(\s*\*\s*$Ident\s*\))\s*|(?:\*\s*)*$Lval\s*=3D\s*$Ident\s*= )\(.*(\&\&|\|\||,)\s*$/) { --=20 2.20.1