riscv: module: Optimize PLT/GOT entry counting

[PATCH] riscv: module: Optimize PLT/GOT entry counting

Posted by Samuel Holland 10 months ago

perf reports that 99.63% of the cycles from `modprobe amdgpu` are spent
inside module_frob_arch_sections(). This is because amdgpu.ko contains
about 300000 relocations in its .rela.text section, and the algorithm in
count_max_entries() takes quadratic time.

Apply two optimizations from the arm64 code, which together reduce the
total execution time by 99.57%. First, sort the relocations so duplicate
entries are adjacent. Second, reduce the number of relocations that must
be sorted by filtering to only relocations that need PLT/GOT entries, as
done in commit d4e0340919fb ("arm64/module: Optimize module load time by
optimizing PLT counting").

Unlike the arm64 code, here the filtering and sorting is done in a
scratch buffer, because the HI20 relocation search optimization in
apply_relocate_add() depends on the original order of the relocations.

Signed-off-by: Samuel Holland <samuel.holland@sifive.com>
---

 arch/riscv/kernel/module-sections.c | 66 +++++++++++++++++++++++++----
 1 file changed, 57 insertions(+), 9 deletions(-)

diff --git a/arch/riscv/kernel/module-sections.c b/arch/riscv/kernel/module-sections.c
index e264e59e596e..91d4f0fbd0af 100644
--- a/arch/riscv/kernel/module-sections.c
+++ b/arch/riscv/kernel/module-sections.c
@@ -9,6 +9,7 @@
 #include <linux/kernel.h>
 #include <linux/module.h>
 #include <linux/moduleloader.h>
+#include <linux/sort.h>
 
 unsigned long module_emit_got_entry(struct module *mod, unsigned long val)
 {
@@ -55,19 +56,27 @@ unsigned long module_emit_plt_entry(struct module *mod, unsigned long val)
 	return (unsigned long)&plt[i];
 }
 
-static int is_rela_equal(const Elf_Rela *x, const Elf_Rela *y)
+#define cmp_3way(a, b)	((a) < (b) ? -1 : (a) > (b))
+
+static int cmp_rela(const void *a, const void *b)
 {
-	return x->r_info == y->r_info && x->r_addend == y->r_addend;
+	const Elf_Rela *x = a, *y = b;
+	int i;
+
+	/* sort by type, symbol index and addend */
+	i = cmp_3way(x->r_info, y->r_info);
+	if (i == 0)
+		i = cmp_3way(x->r_addend, y->r_addend);
+	return i;
 }
 
 static bool duplicate_rela(const Elf_Rela *rela, int idx)
 {
-	int i;
-	for (i = 0; i < idx; i++) {
-		if (is_rela_equal(&rela[i], &rela[idx]))
-			return true;
-	}
-	return false;
+	/*
+	 * Entries are sorted by type, symbol index and addend. That means
+	 * that, if a duplicate entry exists, it must be in the preceding slot.
+	 */
+	return idx > 0 && cmp_rela(rela + idx, rela + idx - 1) == 0;
 }
 
 static void count_max_entries(Elf_Rela *relas, int num,
@@ -87,11 +96,33 @@ static void count_max_entries(Elf_Rela *relas, int num,
 	}
 }
 
+static bool rela_needs_plt_got(const Elf_Rela *rela)
+{
+	unsigned int type = ELF_R_TYPE(rela->r_info);
+
+	return type == R_RISCV_CALL_PLT || type == R_RISCV_GOT_HI20;
+}
+
+/* Copy PLT and GOT relas to the scratch array. */
+static unsigned int partition_plt_got_relas(const Elf_Rela *relas, Elf_Rela *scratch,
+					    unsigned int num_rela)
+{
+	int j = 0;
+
+	for (int i = 0; i < num_rela; i++)
+		if (rela_needs_plt_got(&relas[i]))
+			scratch[j++] = relas[i];
+
+	return j;
+}
+
 int module_frob_arch_sections(Elf_Ehdr *ehdr, Elf_Shdr *sechdrs,
 			      char *secstrings, struct module *mod)
 {
 	unsigned int num_plts = 0;
 	unsigned int num_gots = 0;
+	Elf_Rela *scratch = NULL;
+	size_t scratch_size = 0;
 	int i;
 
 	/*
@@ -132,9 +163,26 @@ int module_frob_arch_sections(Elf_Ehdr *ehdr, Elf_Shdr *sechdrs,
 		if (!(dst_sec->sh_flags & SHF_EXECINSTR))
 			continue;
 
-		count_max_entries(relas, num_rela, &num_plts, &num_gots);
+		/*
+		 * apply_relocate_add() relies on HI20 and LO12 relocation pairs being
+		 * close together, so sort a copy of the section to avoid interfering.
+		 */
+		if (sechdrs[i].sh_size > scratch_size) {
+			scratch_size = sechdrs[i].sh_size;
+			scratch = kvrealloc(scratch, scratch_size, GFP_KERNEL);
+			if (!scratch)
+				return -ENOMEM;
+		}
+
+		/* sort relocations requiring a PLT or GOT entry so duplicates are adjacent */
+		num_rela = partition_plt_got_relas(relas, scratch, num_rela);
+		sort(scratch, num_rela, sizeof(Elf_Rela), cmp_rela, NULL);
+		count_max_entries(scratch, num_rela, &num_plts, &num_gots);
 	}
 
+	if (scratch)
+		kvfree(scratch);
+
 	mod->arch.plt.shdr->sh_type = SHT_NOBITS;
 	mod->arch.plt.shdr->sh_flags = SHF_EXECINSTR | SHF_ALLOC;
 	mod->arch.plt.shdr->sh_addralign = L1_CACHE_BYTES;
-- 
2.47.0

Re: [PATCH] riscv: module: Optimize PLT/GOT entry counting

Posted by Andrew Jones 10 months ago

On Tue, Apr 08, 2025 at 07:45:16PM -0700, Samuel Holland wrote:
> perf reports that 99.63% of the cycles from `modprobe amdgpu` are spent
> inside module_frob_arch_sections(). This is because amdgpu.ko contains
> about 300000 relocations in its .rela.text section, and the algorithm in
> count_max_entries() takes quadratic time.
> 
> Apply two optimizations from the arm64 code, which together reduce the
> total execution time by 99.57%. First, sort the relocations so duplicate
> entries are adjacent. Second, reduce the number of relocations that must
> be sorted by filtering to only relocations that need PLT/GOT entries, as
> done in commit d4e0340919fb ("arm64/module: Optimize module load time by
> optimizing PLT counting").
> 
> Unlike the arm64 code, here the filtering and sorting is done in a
> scratch buffer, because the HI20 relocation search optimization in
> apply_relocate_add() depends on the original order of the relocations.
> 
> Signed-off-by: Samuel Holland <samuel.holland@sifive.com>
> ---
> 
>  arch/riscv/kernel/module-sections.c | 66 +++++++++++++++++++++++++----
>  1 file changed, 57 insertions(+), 9 deletions(-)
> 
> diff --git a/arch/riscv/kernel/module-sections.c b/arch/riscv/kernel/module-sections.c
> index e264e59e596e..91d4f0fbd0af 100644
> --- a/arch/riscv/kernel/module-sections.c
> +++ b/arch/riscv/kernel/module-sections.c
> @@ -9,6 +9,7 @@
>  #include <linux/kernel.h>
>  #include <linux/module.h>
>  #include <linux/moduleloader.h>
> +#include <linux/sort.h>
>  
>  unsigned long module_emit_got_entry(struct module *mod, unsigned long val)
>  {
> @@ -55,19 +56,27 @@ unsigned long module_emit_plt_entry(struct module *mod, unsigned long val)
>  	return (unsigned long)&plt[i];
>  }
>  
> -static int is_rela_equal(const Elf_Rela *x, const Elf_Rela *y)
> +#define cmp_3way(a, b)	((a) < (b) ? -1 : (a) > (b))
> +
> +static int cmp_rela(const void *a, const void *b)
>  {
> -	return x->r_info == y->r_info && x->r_addend == y->r_addend;
> +	const Elf_Rela *x = a, *y = b;
> +	int i;
> +
> +	/* sort by type, symbol index and addend */
> +	i = cmp_3way(x->r_info, y->r_info);
> +	if (i == 0)
> +		i = cmp_3way(x->r_addend, y->r_addend);
> +	return i;
>  }
>  
>  static bool duplicate_rela(const Elf_Rela *rela, int idx)
>  {
> -	int i;
> -	for (i = 0; i < idx; i++) {
> -		if (is_rela_equal(&rela[i], &rela[idx]))
> -			return true;
> -	}
> -	return false;
> +	/*
> +	 * Entries are sorted by type, symbol index and addend. That means
> +	 * that, if a duplicate entry exists, it must be in the preceding slot.
> +	 */
> +	return idx > 0 && cmp_rela(rela + idx, rela + idx - 1) == 0;
>  }
>  
>  static void count_max_entries(Elf_Rela *relas, int num,
> @@ -87,11 +96,33 @@ static void count_max_entries(Elf_Rela *relas, int num,
>  	}
>  }
>  
> +static bool rela_needs_plt_got(const Elf_Rela *rela)
> +{
> +	unsigned int type = ELF_R_TYPE(rela->r_info);
> +
> +	return type == R_RISCV_CALL_PLT || type == R_RISCV_GOT_HI20;

I see these two are sufficient to support count_max_entries(), but Charlie
also added support for R_RISCV_PLT32. I don't know enough about relocation
types to know if we need to consider that one here and in
count_max_entries(), so I'm just pointing it out in case it was
overlooked.

> +}
> +
> +/* Copy PLT and GOT relas to the scratch array. */
> +static unsigned int partition_plt_got_relas(const Elf_Rela *relas, Elf_Rela *scratch,
> +					    unsigned int num_rela)
> +{
> +	int j = 0;
> +
> +	for (int i = 0; i < num_rela; i++)
> +		if (rela_needs_plt_got(&relas[i]))
> +			scratch[j++] = relas[i];
> +
> +	return j;
> +}
> +
>  int module_frob_arch_sections(Elf_Ehdr *ehdr, Elf_Shdr *sechdrs,
>  			      char *secstrings, struct module *mod)
>  {
>  	unsigned int num_plts = 0;
>  	unsigned int num_gots = 0;
> +	Elf_Rela *scratch = NULL;
> +	size_t scratch_size = 0;
>  	int i;
>  
>  	/*
> @@ -132,9 +163,26 @@ int module_frob_arch_sections(Elf_Ehdr *ehdr, Elf_Shdr *sechdrs,
>  		if (!(dst_sec->sh_flags & SHF_EXECINSTR))
>  			continue;
>  
> -		count_max_entries(relas, num_rela, &num_plts, &num_gots);
> +		/*
> +		 * apply_relocate_add() relies on HI20 and LO12 relocation pairs being
> +		 * close together, so sort a copy of the section to avoid interfering.
> +		 */
> +		if (sechdrs[i].sh_size > scratch_size) {
> +			scratch_size = sechdrs[i].sh_size;
> +			scratch = kvrealloc(scratch, scratch_size, GFP_KERNEL);
> +			if (!scratch)
> +				return -ENOMEM;
> +		}
> +
> +		/* sort relocations requiring a PLT or GOT entry so duplicates are adjacent */
> +		num_rela = partition_plt_got_relas(relas, scratch, num_rela);
> +		sort(scratch, num_rela, sizeof(Elf_Rela), cmp_rela, NULL);
> +		count_max_entries(scratch, num_rela, &num_plts, &num_gots);
>  	}
>  
> +	if (scratch)
> +		kvfree(scratch);
> +
>  	mod->arch.plt.shdr->sh_type = SHT_NOBITS;
>  	mod->arch.plt.shdr->sh_flags = SHF_EXECINSTR | SHF_ALLOC;
>  	mod->arch.plt.shdr->sh_addralign = L1_CACHE_BYTES;
> -- 
> 2.47.0
>

Besides my question above,

Reviewed-by: Andrew Jones <ajones@ventanamicro.com>

Thanks,
drew

Re: [PATCH] riscv: module: Optimize PLT/GOT entry counting

Posted by Samuel Holland 10 months ago

Hi Drew,

On 2025-04-09 5:48 AM, Andrew Jones wrote:
> On Tue, Apr 08, 2025 at 07:45:16PM -0700, Samuel Holland wrote:
>> perf reports that 99.63% of the cycles from `modprobe amdgpu` are spent
>> inside module_frob_arch_sections(). This is because amdgpu.ko contains
>> about 300000 relocations in its .rela.text section, and the algorithm in
>> count_max_entries() takes quadratic time.
>>
>> Apply two optimizations from the arm64 code, which together reduce the
>> total execution time by 99.57%. First, sort the relocations so duplicate
>> entries are adjacent. Second, reduce the number of relocations that must
>> be sorted by filtering to only relocations that need PLT/GOT entries, as
>> done in commit d4e0340919fb ("arm64/module: Optimize module load time by
>> optimizing PLT counting").
>>
>> Unlike the arm64 code, here the filtering and sorting is done in a
>> scratch buffer, because the HI20 relocation search optimization in
>> apply_relocate_add() depends on the original order of the relocations.
>>
>> Signed-off-by: Samuel Holland <samuel.holland@sifive.com>
>> ---
>>
>>  arch/riscv/kernel/module-sections.c | 66 +++++++++++++++++++++++++----
>>  1 file changed, 57 insertions(+), 9 deletions(-)
>>
>> diff --git a/arch/riscv/kernel/module-sections.c b/arch/riscv/kernel/module-sections.c
>> index e264e59e596e..91d4f0fbd0af 100644
>> --- a/arch/riscv/kernel/module-sections.c
>> +++ b/arch/riscv/kernel/module-sections.c
>> @@ -9,6 +9,7 @@
>>  #include <linux/kernel.h>
>>  #include <linux/module.h>
>>  #include <linux/moduleloader.h>
>> +#include <linux/sort.h>
>>  
>>  unsigned long module_emit_got_entry(struct module *mod, unsigned long val)
>>  {
>> @@ -55,19 +56,27 @@ unsigned long module_emit_plt_entry(struct module *mod, unsigned long val)
>>  	return (unsigned long)&plt[i];
>>  }
>>  
>> -static int is_rela_equal(const Elf_Rela *x, const Elf_Rela *y)
>> +#define cmp_3way(a, b)	((a) < (b) ? -1 : (a) > (b))
>> +
>> +static int cmp_rela(const void *a, const void *b)
>>  {
>> -	return x->r_info == y->r_info && x->r_addend == y->r_addend;
>> +	const Elf_Rela *x = a, *y = b;
>> +	int i;
>> +
>> +	/* sort by type, symbol index and addend */
>> +	i = cmp_3way(x->r_info, y->r_info);
>> +	if (i == 0)
>> +		i = cmp_3way(x->r_addend, y->r_addend);
>> +	return i;
>>  }
>>  
>>  static bool duplicate_rela(const Elf_Rela *rela, int idx)
>>  {
>> -	int i;
>> -	for (i = 0; i < idx; i++) {
>> -		if (is_rela_equal(&rela[i], &rela[idx]))
>> -			return true;
>> -	}
>> -	return false;
>> +	/*
>> +	 * Entries are sorted by type, symbol index and addend. That means
>> +	 * that, if a duplicate entry exists, it must be in the preceding slot.
>> +	 */
>> +	return idx > 0 && cmp_rela(rela + idx, rela + idx - 1) == 0;
>>  }
>>  
>>  static void count_max_entries(Elf_Rela *relas, int num,
>> @@ -87,11 +96,33 @@ static void count_max_entries(Elf_Rela *relas, int num,
>>  	}
>>  }
>>  
>> +static bool rela_needs_plt_got(const Elf_Rela *rela)
>> +{
>> +	unsigned int type = ELF_R_TYPE(rela->r_info);
>> +
>> +	return type == R_RISCV_CALL_PLT || type == R_RISCV_GOT_HI20;
> 
> I see these two are sufficient to support count_max_entries(), but Charlie
> also added support for R_RISCV_PLT32. I don't know enough about relocation
> types to know if we need to consider that one here and in
> count_max_entries(), so I'm just pointing it out in case it was
> overlooked.

Yes, R_RISCV_PLT32 should be included. Today if a module were to include a
R_RISCV_PLT32 relocation against a unique symbol, it would hit a BUG_ON in
module_emit_plt_entry() because there would be no space in the PLT. If there was
also a R_RISCV_CALL_PLT relocation against the same symbol, the code would
silently succeed.

Regards,
Samuel

>> +}
>> +
>> +/* Copy PLT and GOT relas to the scratch array. */
>> +static unsigned int partition_plt_got_relas(const Elf_Rela *relas, Elf_Rela *scratch,
>> +					    unsigned int num_rela)
>> +{
>> +	int j = 0;
>> +
>> +	for (int i = 0; i < num_rela; i++)
>> +		if (rela_needs_plt_got(&relas[i]))
>> +			scratch[j++] = relas[i];
>> +
>> +	return j;
>> +}
>> +
>>  int module_frob_arch_sections(Elf_Ehdr *ehdr, Elf_Shdr *sechdrs,
>>  			      char *secstrings, struct module *mod)
>>  {
>>  	unsigned int num_plts = 0;
>>  	unsigned int num_gots = 0;
>> +	Elf_Rela *scratch = NULL;
>> +	size_t scratch_size = 0;
>>  	int i;
>>  
>>  	/*
>> @@ -132,9 +163,26 @@ int module_frob_arch_sections(Elf_Ehdr *ehdr, Elf_Shdr *sechdrs,
>>  		if (!(dst_sec->sh_flags & SHF_EXECINSTR))
>>  			continue;
>>  
>> -		count_max_entries(relas, num_rela, &num_plts, &num_gots);
>> +		/*
>> +		 * apply_relocate_add() relies on HI20 and LO12 relocation pairs being
>> +		 * close together, so sort a copy of the section to avoid interfering.
>> +		 */
>> +		if (sechdrs[i].sh_size > scratch_size) {
>> +			scratch_size = sechdrs[i].sh_size;
>> +			scratch = kvrealloc(scratch, scratch_size, GFP_KERNEL);
>> +			if (!scratch)
>> +				return -ENOMEM;
>> +		}
>> +
>> +		/* sort relocations requiring a PLT or GOT entry so duplicates are adjacent */
>> +		num_rela = partition_plt_got_relas(relas, scratch, num_rela);
>> +		sort(scratch, num_rela, sizeof(Elf_Rela), cmp_rela, NULL);
>> +		count_max_entries(scratch, num_rela, &num_plts, &num_gots);
>>  	}
>>  
>> +	if (scratch)
>> +		kvfree(scratch);
>> +
>>  	mod->arch.plt.shdr->sh_type = SHT_NOBITS;
>>  	mod->arch.plt.shdr->sh_flags = SHF_EXECINSTR | SHF_ALLOC;
>>  	mod->arch.plt.shdr->sh_addralign = L1_CACHE_BYTES;
>> -- 
>> 2.47.0
>>
> 
> Besides my question above,
> 
> Reviewed-by: Andrew Jones <ajones@ventanamicro.com>
> 
> Thanks,
> drew