[PATCH RESEND] Add comprehensive detection and automatic fixing capability for full-width (Unicode) characters that are commonly mistaken for ASCII punctuation marks. This helps catch input method editor artifacts that can cause compilation errors or formatting issues.

Morduang Zang posted 1 patch 1 month, 2 weeks ago
scripts/checkpatch.pl | 84 +++++++++++++++++++++++++++++++++++++++++++
1 file changed, 84 insertions(+)
[PATCH RESEND] Add comprehensive detection and automatic fixing capability for full-width (Unicode) characters that are commonly mistaken for ASCII punctuation marks. This helps catch input method editor artifacts that can cause compilation errors or formatting issues.
Posted by Morduang Zang 1 month, 2 weeks ago
The implementation detects 25 types of full-width characters:
- Basic punctuation: ；，。（）！？： and the ideographic space (U+3000)
- Programming brackets: ［］｛｝＜＞
- Assignment and comparison: ＝
- Arithmetic operators: ＋－＊／＼
- Other programming symbols: ％＃＆｜

Detection covers three areas:
1. Code lines (lines starting with '+') - FULLWIDTH_CHARS
2. Commit messages - FULLWIDTH_CHARS_COMMIT
3. Subject lines - FULLWIDTH_CHARS_SUBJECT

Example usage:
  ./scripts/checkpatch.pl my_patch.patch
  ./scripts/checkpatch.pl --fix my_patch.patch
  ./scripts/checkpatch.pl --fix-inplace my_source.c

Signed-off-by: Morduang Zang <zhangdandan@uniontech.com>
Signed-off-by: Wangyuli <wangyuli@uniontech.com>
---
 scripts/checkpatch.pl | 84 +++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 84 insertions(+)

diff --git a/scripts/checkpatch.pl b/scripts/checkpatch.pl
index e722dd6fa8ef..f4cb547a470b 100755
--- a/scripts/checkpatch.pl
+++ b/scripts/checkpatch.pl
@@ -75,6 +75,41 @@ my $git_command ='export LANGUAGE=en_US.UTF-8; git';
 my $tabsize = 8;
 my ${CONFIG_} = "CONFIG_";
 
+# Full-width characters as UTF-8 bytes => [ASCII replacement, name, bytes shown in warnings]
+my %fullwidth_chars = (
+	# Basic punctuation
+	"\xef\xbc\x9b" => [";", "semicolon", "\xef\xbc\x9b"],
+	"\xef\xbc\x8c" => [",", "comma", "\xef\xbc\x8c"],
+	"\xe3\x80\x82" => [".", "period", "\xe3\x80\x82"],
+	"\xef\xbc\x88" => ["(", "opening parenthesis", "\xef\xbc\x88"],
+	"\xef\xbc\x89" => [")", "closing parenthesis", "\xef\xbc\x89"],
+	"\xef\xbc\x81" => ["!", "exclamation mark", "\xef\xbc\x81"],
+	"\xef\xbc\x9f" => ["?", "question mark", "\xef\xbc\x9f"],
+	"\xef\xbc\x9a" => [":", "colon", "\xef\xbc\x9a"],
+	"\xe3\x80\x80" => [" ", "space", "\xe3\x80\x80"],
+	# Programming brackets
+	"\xef\xbc\xbb" => ["[", "left square bracket", "\xef\xbc\xbb"],
+	"\xef\xbc\xbd" => ["]", "right square bracket", "\xef\xbc\xbd"],
+	"\xef\xbd\x9b" => ["{", "left curly bracket", "\xef\xbd\x9b"],
+	"\xef\xbd\x9d" => ["}", "right curly bracket", "\xef\xbd\x9d"],
+	"\xef\xbc\x9c" => ["<", "less-than sign", "\xef\xbc\x9c"],
+	"\xef\xbc\x9e" => [">", "greater-than sign", "\xef\xbc\x9e"],
+	# Assignment and comparison
+	"\xef\xbc\x9d" => ["=", "equals sign", "\xef\xbc\x9d"],
+	# Arithmetic operators
+	"\xef\xbc\x8b" => ["+", "plus sign", "\xef\xbc\x8b"],
+	"\xef\xbc\x8d" => ["-", "minus sign", "\xef\xbc\x8d"],
+	"\xef\xbc\x8a" => ["*", "asterisk", "\xef\xbc\x8a"],
+	"\xef\xbc\x8f" => ["/", "solidus", "\xef\xbc\x8f"],
+	"\xef\xbc\xbc" => ["\\", "reverse solidus", "\xef\xbc\xbc"],
+	# Other programming symbols
+	"\xef\xbc\x85" => ["%", "percent sign", "\xef\xbc\x85"],
+	"\xef\xbc\x83" => ["#", "number sign", "\xef\xbc\x83"],
+	"\xef\xbc\x86" => ["&", "ampersand", "\xef\xbc\x86"],
+	"\xef\xbd\x9c" => ["|", "vertical line", "\xef\xbd\x9c"],
+);
+my $fullwidth_pattern = join('|', map { quotemeta($_) } keys %fullwidth_chars);
+
 my %maybe_linker_symbol; # for externs in c exceptions, when seen in *vmlinux.lds.h
 
 sub help {
@@ -1019,6 +1054,40 @@ sub read_words {
 	return 0;
 }
 
+# Check $line for full-width characters and optionally fix them.
+# One $warning_type warning is issued per character found; $context is spliced
+# into the message text and $herecurr is appended after a newline.  When
+# $apply_fix is true and $fixed_ref is given, the characters are replaced by
+# their ASCII equivalents in $fixed_ref->[$fixlinenr].
+# Returns the number of full-width characters found in $line.
+sub check_fullwidth_chars {
+	my ($line, $context, $warning_type, $apply_fix, $fixlinenr, $fixed_ref, $herecurr) = @_;
+	my @found_chars = ();
+	my $reported = 0;
+
+	return 0 unless $line =~ /$fullwidth_pattern/o;
+
+	# Build one message per occurrence, in the order they appear in $line.
+	while ($line =~ /($fullwidth_pattern)/go) {
+		my ($ascii_char, $name, $fullwidth_char) = @{$fullwidth_chars{$1}};
+		push @found_chars, "Full-width $name ($fullwidth_char) found$context, use ASCII $name ($ascii_char) instead";
+	}
+
+	foreach my $msg (@found_chars) {
+		$reported = 1 if WARN($warning_type, $msg . "\n" . $herecurr);
+	}
+
+	# Edit the fixed line in place rather than overwriting it with a copy of
+	# $line: it may already carry fixes from other checks.  Skip the fix when
+	# WARN() reported nothing.
+	if ($reported && $apply_fix && defined $fixed_ref &&
+	    defined $fixed_ref->[$fixlinenr]) {
+		$fixed_ref->[$fixlinenr] =~ s/($fullwidth_pattern)/$fullwidth_chars{$1}[0]/ge;
+	}
+
+	return scalar @found_chars;
+}
+
 my $const_structs;
 if (show_type("CONST_STRUCT")) {
 	read_words(\$const_structs, $conststructsfile)
@@ -2961,6 +3030,11 @@ sub process {
 			$commit_log_has_diff = 1;
 		}
 
+# Check for full-width characters in commit message
+		if ($in_commit_log && show_type("FULLWIDTH_CHARS_COMMIT")) {
+			check_fullwidth_chars($rawline, " in commit message", "FULLWIDTH_CHARS_COMMIT", 0, 0, undef, $herecurr);
+		}
+
 # Check for incorrect file permissions
 		if ($line =~ /^new (file )?mode.*[7531]\d{0,2}$/) {
 			my $permhere = $here . "FILE: $realfile\n";
@@ -3266,6 +3340,11 @@ sub process {
 			     "A patch subject line should describe the change not the tool that found it\n" . $herecurr);
 		}
 
+# Check for full-width characters in Subject line
+		if ($in_header_lines && $line =~ /^Subject:/i && show_type("FULLWIDTH_CHARS_SUBJECT")) {
+			check_fullwidth_chars($rawline, " in subject line", "FULLWIDTH_CHARS_SUBJECT", 0, 0, undef, $herecurr);
+		}
+
 # Check for Gerrit Change-Ids not in any patch context
 		if ($realfile eq '' && !$has_patch_separator && $line =~ /^\s*change-id:/i) {
 			if (ERROR("GERRIT_CHANGE_ID",
@@ -3974,6 +4053,11 @@ sub process {
 			}
 		}
 
+# check for full-width characters (full-width punctuation marks, etc.)
+		if ($rawline =~ /^\+/ && show_type("FULLWIDTH_CHARS")) {
+			check_fullwidth_chars($rawline, "", "FULLWIDTH_CHARS", $fix, $fixlinenr, \@fixed, $herecurr);
+		}
+
 # check multi-line statement indentation matches previous line
 		if ($perl_version_ok &&
 		    $prevline =~ /^\+([ \t]*)((?:$c90_Keywords(?:\s+if)\s*)|(?:$Declare\s*)?(?:$Ident|\(\s*\*\s*$Ident\s*\))\s*|(?:\*\s*)*$Lval\s*=\s*$Ident\s*)\(.*(\&\&|\|\||,)\s*$/) {
-- 
2.20.1