[PATCH v2 2/2] perf trace: Rewrite BPF code to pass the verifier

Howard Chu posted 2 patches 1 month, 2 weeks ago
[PATCH v2 2/2] perf trace: Rewrite BPF code to pass the verifier
Posted by Howard Chu 1 month, 2 weeks ago
Rewrite the code to add more memory bound checking in order to pass the
BPF verifier, no logic is changed.

This rewrite is centered around two main ideas:

- Always use a variable instead of an expression in if block's condition,
  so BPF verifier keeps track of the correct register.
- Delay the check as late as possible, just before the BPF function
  call.

Things that can be done better still:

- Instead of allowing a theoretical maximum of a 6-argument augmentation
  payload, reduce the payload to a smaller fixed size.

Signed-off-by: Howard Chu <howardchu95@gmail.com>
---
 .../bpf_skel/augmented_raw_syscalls.bpf.c     | 122 ++++++++++--------
 1 file changed, 67 insertions(+), 55 deletions(-)

diff --git a/tools/perf/util/bpf_skel/augmented_raw_syscalls.bpf.c b/tools/perf/util/bpf_skel/augmented_raw_syscalls.bpf.c
index b2f17cca014b..9ae459faac4b 100644
--- a/tools/perf/util/bpf_skel/augmented_raw_syscalls.bpf.c
+++ b/tools/perf/util/bpf_skel/augmented_raw_syscalls.bpf.c
@@ -157,9 +157,9 @@ static inline int augmented__output(void *ctx, struct augmented_args_payload *ar
 	return bpf_perf_event_output(ctx, &__augmented_syscalls__, BPF_F_CURRENT_CPU, args, len);
 }
 
-static inline int augmented__beauty_output(void *ctx, void *data, int len)
+static inline int augmented__beauty_output(void *ctx, struct beauty_payload_enter *args, int len)
 {
-	return bpf_perf_event_output(ctx, &__augmented_syscalls__, BPF_F_CURRENT_CPU, data, len);
+	return bpf_perf_event_output(ctx, &__augmented_syscalls__, BPF_F_CURRENT_CPU, args, len);
 }
 
 static inline
@@ -277,25 +277,31 @@ int sys_enter_rename(struct syscall_enter_args *args)
 	struct augmented_args_payload *augmented_args = augmented_args_payload();
 	const void *oldpath_arg = (const void *)args->args[0],
 		   *newpath_arg = (const void *)args->args[1];
-	unsigned int len = sizeof(augmented_args->args), oldpath_len, newpath_len;
+	unsigned int len = sizeof(augmented_args->args), oldpath_len, newpath_len, aligned_size;
 
         if (augmented_args == NULL)
-                return 1; /* Failure: don't filter */
+		goto failure;
 
 	len += 2 * sizeof(u64); // The overhead of size and err, just before the payload...
 
 	oldpath_len = augmented_arg__read_str(&augmented_args->arg, oldpath_arg, sizeof(augmented_args->arg.value));
-	augmented_args->arg.size = PERF_ALIGN(oldpath_len + 1, sizeof(u64));
-	len += augmented_args->arg.size;
+	aligned_size = PERF_ALIGN(oldpath_len + 1, sizeof(u64));
+	augmented_args->arg.size = aligned_size;
+	len += aligned_size;
+
+	/* Every read from userspace is limited to value size */
+	if (aligned_size > sizeof(augmented_args->arg.value))
+		goto failure;
 
-	struct augmented_arg *arg2 = (void *)&augmented_args->arg.value + augmented_args->arg.size;
+	struct augmented_arg *arg2 = (void *)&augmented_args->arg.value + aligned_size;
 
 	newpath_len = augmented_arg__read_str(arg2, newpath_arg, sizeof(augmented_args->arg.value));
 	arg2->size = newpath_len;
-
 	len += newpath_len;
 
 	return augmented__output(args, augmented_args, len);
+failure:
+	return 1; /* Failure: don't filter */
 }
 
 SEC("tp/syscalls/sys_enter_renameat2")
@@ -304,25 +310,31 @@ int sys_enter_renameat2(struct syscall_enter_args *args)
 	struct augmented_args_payload *augmented_args = augmented_args_payload();
 	const void *oldpath_arg = (const void *)args->args[1],
 		   *newpath_arg = (const void *)args->args[3];
-	unsigned int len = sizeof(augmented_args->args), oldpath_len, newpath_len;
+	unsigned int len = sizeof(augmented_args->args), oldpath_len, newpath_len, aligned_size;
 
         if (augmented_args == NULL)
-                return 1; /* Failure: don't filter */
+		goto failure;
 
 	len += 2 * sizeof(u64); // The overhead of size and err, just before the payload...
 
 	oldpath_len = augmented_arg__read_str(&augmented_args->arg, oldpath_arg, sizeof(augmented_args->arg.value));
-	augmented_args->arg.size = PERF_ALIGN(oldpath_len + 1, sizeof(u64));
-	len += augmented_args->arg.size;
+	aligned_size = PERF_ALIGN(oldpath_len + 1, sizeof(u64));
+	augmented_args->arg.size = aligned_size;
+	len += aligned_size;
 
-	struct augmented_arg *arg2 = (void *)&augmented_args->arg.value + augmented_args->arg.size;
+	/* Every read from userspace is limited to value size */
+	if (aligned_size > sizeof(augmented_args->arg.value))
+		goto failure;
+
+	struct augmented_arg *arg2 = (void *)&augmented_args->arg.value + aligned_size;
 
 	newpath_len = augmented_arg__read_str(arg2, newpath_arg, sizeof(augmented_args->arg.value));
 	arg2->size = newpath_len;
-
 	len += newpath_len;
 
 	return augmented__output(args, augmented_args, len);
+failure:
+	return 1; /* Failure: don't filter */
 }
 
 #define PERF_ATTR_SIZE_VER0     64      /* sizeof first published struct */
@@ -422,12 +434,12 @@ static bool pid_filter__has(struct pids_filtered *pids, pid_t pid)
 
 static int augment_sys_enter(void *ctx, struct syscall_enter_args *args)
 {
-	bool augmented, do_output = false;
-	int zero = 0, size, aug_size, index, output = 0,
-	    value_size = sizeof(struct augmented_arg) - offsetof(struct augmented_arg, value);
+	bool do_augment = false;
+	int zero = 0, value_size = sizeof(struct augmented_arg) - sizeof(u64);
 	unsigned int nr, *beauty_map;
 	struct beauty_payload_enter *payload;
-	void *arg, *payload_offset;
+	void *payload_offset, *value_offset;
+	u64 len = 0; /* has to be u64, otherwise it won't pass the verifier */
 
 	/* fall back to do predefined tail call */
 	if (args == NULL)
@@ -436,16 +448,18 @@ static int augment_sys_enter(void *ctx, struct syscall_enter_args *args)
 	/* use syscall number to get beauty_map entry */
 	nr             = (__u32)args->syscall_nr;
 	beauty_map     = bpf_map_lookup_elem(&beauty_map_enter, &nr);
+	if (beauty_map == NULL)
+		return 1;
 
 	/* set up payload for output */
 	payload        = bpf_map_lookup_elem(&beauty_payload_enter_map, &zero);
 	payload_offset = (void *)&payload->aug_args;
-
-	if (beauty_map == NULL || payload == NULL)
+	if (payload == NULL)
 		return 1;
 
 	/* copy the sys_enter header, which has the syscall_nr */
 	__builtin_memcpy(&payload->args, args, sizeof(struct syscall_enter_args));
+	len += sizeof(struct syscall_enter_args);
 
 	/*
 	 * Determine what type of argument and how many bytes to read from user space, using the
@@ -457,52 +471,50 @@ static int augment_sys_enter(void *ctx, struct syscall_enter_args *args)
 	 * buffer: -1 * (index of paired len) -> value of paired len (maximum: TRACE_AUG_MAX_BUF)
 	 */
 	for (int i = 0; i < 6; i++) {
-		arg = (void *)args->args[i];
-		augmented = false;
-		size = beauty_map[i];
-		aug_size = size; /* size of the augmented data read from user space */
+		int augment_size = beauty_map[i];
+		unsigned int augment_size_with_header;
+		void *addr = (void *)args->args[i];
+		bool is_augmented = false;
 
-		if (size == 0 || arg == NULL)
+		if (augment_size == 0 || addr == NULL)
 			continue;
 
-		if (size == 1) { /* string */
-			aug_size = bpf_probe_read_user_str(((struct augmented_arg *)payload_offset)->value, value_size, arg);
-			/* minimum of 0 to pass the verifier */
-			if (aug_size < 0)
-				aug_size = 0;
-
-			augmented = true;
-		} else if (size > 0 && size <= value_size) { /* struct */
-			if (!bpf_probe_read_user(((struct augmented_arg *)payload_offset)->value, size, arg))
-				augmented = true;
-		} else if (size < 0 && size >= -6) { /* buffer */
-			index = -(size + 1);
-			aug_size = args->args[index];
-
-			if (aug_size > TRACE_AUG_MAX_BUF)
-				aug_size = TRACE_AUG_MAX_BUF;
-
-			if (aug_size > 0) {
-				if (!bpf_probe_read_user(((struct augmented_arg *)payload_offset)->value, aug_size, arg))
-					augmented = true;
-			}
+		value_offset = ((struct augmented_arg *)payload_offset)->value;
+
+		if (augment_size == 1) { /* string */
+			augment_size = bpf_probe_read_user_str(value_offset, value_size, addr);
+			is_augmented = true;
+		} else if (augment_size > 1 && augment_size <= value_size) { /* struct */
+			if (!bpf_probe_read_user(value_offset, value_size, addr))
+				is_augmented = true;
+		} else if (augment_size < 0 && augment_size >= -6) { /* buffer */
+			int index = -(augment_size + 1);
+
+			augment_size = args->args[index] > TRACE_AUG_MAX_BUF ? TRACE_AUG_MAX_BUF : args->args[index];
+			if (!bpf_probe_read_user(value_offset, augment_size, addr))
+				is_augmented = true;
 		}
 
-		/* write data to payload */
-		if (augmented) {
-			int written = offsetof(struct augmented_arg, value) + aug_size;
+		/* Augmented data size is limited to value size */
+		if (augment_size > value_size)
+			augment_size = value_size;
+
+		/* Explicitly define this variable to pass the verifier */
+		augment_size_with_header = sizeof(u64) + augment_size;
 
-			((struct augmented_arg *)payload_offset)->size = aug_size;
-			output += written;
-			payload_offset += written;
-			do_output = true;
+		/* Write data to payload */
+		if (is_augmented && augment_size_with_header <= sizeof(struct augmented_arg)) {
+			((struct augmented_arg *)payload_offset)->size = augment_size;
+			do_augment      = true;
+			len            += augment_size_with_header;
+			payload_offset += augment_size_with_header;
 		}
 	}
 
-	if (!do_output)
+	if (!do_augment || len > sizeof(struct beauty_payload_enter))
 		return 1;
 
-	return augmented__beauty_output(ctx, payload, sizeof(struct syscall_enter_args) + output);
+	return augmented__beauty_output(ctx, payload, len);
 }
 
 SEC("tp/raw_syscalls/sys_enter")
-- 
2.43.0