On Mon, 14 Mar 2022, Joe Perches wrote:
> Care to describe _how_ coccinelle was helpful in finding
> these typos in comments?
First, Coccinelle can bind a metavariable to the comments before, within
and after another kind of term. So I collected the comments before,
within, and after statements and declarations.
Second, I also used Coccinelle to collect all of the identifiers
referenced in the same file, and discarded all of these words from
consideration.
Otherwise, it uses the Python library enchant as a dictionary, plus some
hacks to reduce the number of false positives, including dropping words
that occur multiple times. The results are still maybe 90% false
positives, though.
The semantic patch is attached. It gives around 30K results for the
current linux-next.
julia

@initialize:ocaml@
@@
(* Comment strings that check has already processed; used there to skip a
   comment that is encountered again.  Emptied by the script rule that calls
   Hashtbl.clear seen. *)
let seen = Hashtbl.create 101
(* Suspect words recorded by check through Common.hashadd.  The finalize
   rule reads each value as a reference to a list of (loc, word, position)
   triples. *)
let bseen = Hashtbl.create 101
(* Cache from an uncapitalized word to the output lines of the external
   spell-check command, so the command runs at most once per word. *)
let wseen = Hashtbl.create 101
(* Identifiers collected by add, together with their underscore-separated
   pieces.  Words found here are never reported by check. *)
let ids = Hashtbl.create 101
(* Control-flow exception: lets a helper abandon an iteration as soon as the
   answer is known. *)
exception NotOK
(* Lowercase words that check accepts without consulting the spell checker:
   stems of negative contractions (the part before the apostrophe, such as
   aren and isn) plus a few technical terms and preprocessor keywords. *)
let okw = ["aren";"isn";"wasn";"doesn";"didn";"weren";"shouldn";"couldn";"wouldn";"hasn";"haven";"linux";"hotplug";"cpu";"ifdef";"ifndef";"endif";"struct"]
(* Record the identifier i, and every piece obtained by splitting it at
   underscores, in the ids table so that check will not treat those words as
   misspellings.
   Always returns false.  NOTE(review): this function is used as the script
   constraint of the identifier rule, so presumably that rule never matches
   and is run purely for this side effect. *)
let add i =
  (* Insert a name once; a name already present is left untouched. *)
  let remember name =
    if not (Hashtbl.mem ids name) then Hashtbl.add ids name () in
  remember i;
  List.iter remember (Str.split (Str.regexp "_") i);
  false
(* hasvowel s is true when s contains at least one of the letters
   a, e, i, o, u or y in either case, and false otherwise (including for the
   empty string).
   The scan stops at the first vowel through boolean short-circuiting, so no
   exception is needed for the early exit. *)
let hasvowel s =
  let vowels = "aeiouyAEIOUY" in
  let len = String.length s in
  let rec scan i =
    i < len && (String.contains vowels s.[i] || scan (i + 1)) in
  scan 0
(* only_letters s is true when the first character of s is an ASCII letter
   of either case and every following character is a lowercase ASCII letter.
   The empty string is accepted.
   Evaluation stops at the first offending character through boolean
   short-circuiting, so no exception is needed for the early exit. *)
let only_letters s =
  let is_lower c = 'a' <= c && c <= 'z' in
  let is_upper c = 'A' <= c && c <= 'Z' in
  let len = String.length s in
  (* Every character from index i onwards must be lowercase. *)
  let rec tail_ok i = i >= len || (is_lower s.[i] && tail_ok (i + 1)) in
  len = 0 || ((is_lower s.[0] || is_upper s.[0]) && tail_ok 1)
(* Spell-check one comment string.
   bad : reference to the list of words already flagged by the calling
         script rule; extended here.
   loc : label saying where the comment sits (the callers pass before,
         within or after); stored with each flagged word.
   p   : position value from the matching rule; stored with each flagged word.
   c   : the comment text.
   A comment already present in seen is ignored.  Otherwise it is split at
   word boundaries and each token that survives the filters below is looked
   up with the external command python spell.py (results cached in wseen).
   NOTE(review): presumably the command prints something only for a
   misspelled word; the word is recorded once per output line. *)
let check bad loc p c =
  (* A token is worth checking when it is longer than two characters, looks
     like a plain word, is not a known identifier, has not been flagged yet
     and contains a vowel.
     NOTE(review): the lookup in bad uses the original capitalisation while
     bad stores uncapitalized words; confirm this is intended. *)
  let is_candidate tok =
    String.length tok > 2
    && only_letters tok
    && not (Hashtbl.mem ids tok)
    && not (List.mem tok !bad)
    && hasvowel tok in
  (* Output lines of the spell-check command for w, computed at most once. *)
  let spell_output w =
    match Hashtbl.find wseen w with
    | cached -> cached
    | exception Not_found ->
      let out =
        Common.cmd_to_list (Printf.sprintf "python spell.py %s" w) in
      Hashtbl.add wseen w out;
      out in
  let examine tok =
    if is_candidate tok then begin
      let w = String.uncapitalize_ascii tok in
      if not (List.mem w okw) then
        List.iter
          (fun _ ->
            bad := w :: !bad;
            Common.hashadd bseen w (loc, w, p))
          (spell_output w)
    end in
  if not (Hashtbl.mem seen c) then begin
    Hashtbl.add seen c ();
    List.iter examine (Str.split (Str.regexp "\\b") c)
  end
@script:ocaml@
@@
(* Forget which comment strings have already been checked.
   NOTE(review): presumably this rule runs once for each file being
   processed, which would make the duplicate-comment filter in check
   per-file; confirm against the Coccinelle script rule semantics. *)
Hashtbl.clear seen
@identifier@
// Passes each candidate identifier to the OCaml function add, which records
// it and its underscore-separated pieces in the ids table.  add always
// returns false, so presumably this rule never matches and exists only for
// that side effect.
identifier i : script:ocaml() { add i };
@@
i
@r1@
// Matches any statement S, binding c to the comments attached to it and p to
// its position.  Both are consumed by the script rule that follows.
comments c;
statement S;
position p;
@@
S@c@p
@script:ocaml@
c << r1.c;
p << r1.p;
@@
(* Spell-check the comments bound by r1.  c is a list of triples holding the
   comments before, within and after the matched statement.  The list bad is
   fresh for every run of this rule and is shared by all check calls made
   here. *)
let bad = ref [] in
let scan where = List.iter (check bad where p) in
List.iter
  (fun (before, within, after) ->
    scan "before" before;
    scan "within" within;
    scan "after" after)
  c
@r2@
// Matches any declaration d, binding c to the comments attached to it and p
// to its position.  Both are consumed by the script rule that follows.
comments c;
declaration d;
position p;
@@
d@c@p
@script:ocaml@
c << r2.c;
p << r2.p;
@@
(* Spell-check the comments bound by r2.  c is a list of triples holding the
   comments before, within and after the matched declaration.  The list bad
   is fresh for every run of this rule and is shared by all check calls made
   here. *)
let bad = ref [] in
let scan where = List.iter (check bad where p) in
List.iter
  (fun (before, within, after) ->
    scan "before" before;
    scan "within" within;
    scan "after" after)
  c
@finalize:ocaml@
bseen << merge.bseen;
@@
(* Report the collected suspect words.  The merged bseen value is a list of
   tables (NOTE(review): presumably one per worker process; confirm).  Only
   a word whose entry holds exactly one occurrence is printed; a word
   recorded more than once is taken to be deliberate and is skipped. *)
List.iter
  (Hashtbl.iter
     (fun _key occurrences ->
       match !occurrences with
       | [(loc, word, p)] ->
         let msg =
           Printf.sprintf "problem with %s comment word: %s" loc word in
         Coccilib.print_main msg p
       | _ -> ()))
  bseen