[v3] drm: Optimise for continuous memory allocation

[PATCH v3] drm: Optimise for continuous memory allocation

Posted by xinhui pan 2 years, 9 months ago

Currently drm-buddy does not have full knowledge of continuous memory.

Lets consider scenario below.
order 1:    L		    R
order 0: LL	LR	RL	RR
for order 1 allocation, it can offer L or R or LR+RL.

For now, we only implement L or R case for continuous memory allocation.
So this patch aims to implement the LR+RL case.

Signed-off-by: xinhui pan <xinhui.pan@amd.com>
---
change from v2:
search continuous block in nearby root if needed

change from v1:
implement top-down continuous allocation
---
 drivers/gpu/drm/drm_buddy.c | 78 +++++++++++++++++++++++++++++++++----
 1 file changed, 71 insertions(+), 7 deletions(-)

diff --git a/drivers/gpu/drm/drm_buddy.c b/drivers/gpu/drm/drm_buddy.c
index 11bb59399471..ff58eb3136d2 100644
--- a/drivers/gpu/drm/drm_buddy.c
+++ b/drivers/gpu/drm/drm_buddy.c
@@ -386,6 +386,58 @@ alloc_range_bias(struct drm_buddy *mm,
 	return ERR_PTR(err);
 }
 
+static struct drm_buddy_block *
+find_continuous_blocks(struct drm_buddy *mm,
+		       int order,
+		       unsigned long flags,
+		       struct drm_buddy_block **rn)
+{
+	struct list_head *head = &mm->free_list[order];
+	struct drm_buddy_block *node, *parent, *free_node, *max_node = NULL;
+	int i;
+
+	list_for_each_entry(free_node, head, link) {
+		if (max_node) {
+			if (!(flags & DRM_BUDDY_TOPDOWN_ALLOCATION))
+				break;
+
+			if (drm_buddy_block_offset(free_node) <
+			    drm_buddy_block_offset(max_node))
+				continue;
+		}
+
+		parent = free_node;
+		do {
+			node = parent;
+			parent = parent->parent;
+		} while (parent && parent->right == node);
+
+		if (!parent) {
+			for (i = 0; i < mm->n_roots - 1; i++)
+				if (mm->roots[i] == node)
+					break;
+			if (i == mm->n_roots - 1)
+				continue;
+			node = mm->roots[i + 1];
+		} else {
+			node = parent->right;
+		}
+
+		while (drm_buddy_block_is_split(node))
+			node = node->left;
+
+		if (drm_buddy_block_is_free(node) &&
+		    drm_buddy_block_order(node) == order) {
+			*rn = node;
+			max_node = free_node;
+			BUG_ON(drm_buddy_block_offset(node) !=
+				drm_buddy_block_offset(max_node) +
+				drm_buddy_block_size(mm, max_node));
+		}
+	}
+	return max_node;
+}
+
 static struct drm_buddy_block *
 get_maxblock(struct list_head *head)
 {
@@ -637,7 +689,7 @@ int drm_buddy_alloc_blocks(struct drm_buddy *mm,
 			   struct list_head *blocks,
 			   unsigned long flags)
 {
-	struct drm_buddy_block *block = NULL;
+	struct drm_buddy_block *block = NULL, *rblock = NULL;
 	unsigned int min_order, order;
 	unsigned long pages;
 	LIST_HEAD(allocated);
@@ -689,17 +741,29 @@ int drm_buddy_alloc_blocks(struct drm_buddy *mm,
 				break;
 
 			if (order-- == min_order) {
+				if (!(flags & DRM_BUDDY_RANGE_ALLOCATION) &&
+				    min_order != 0 && pages == BIT(order + 1)) {
+					block = find_continuous_blocks(mm,
+								       order,
+								       flags,
+								       &rblock);
+					if (block)
+						break;
+				}
 				err = -ENOSPC;
 				goto err_free;
 			}
 		} while (1);
 
-		mark_allocated(block);
-		mm->avail -= drm_buddy_block_size(mm, block);
-		kmemleak_update_trace(block);
-		list_add_tail(&block->link, &allocated);
-
-		pages -= BIT(order);
+		do {
+			mark_allocated(block);
+			mm->avail -= drm_buddy_block_size(mm, block);
+			kmemleak_update_trace(block);
+			list_add_tail(&block->link, &allocated);
+			pages -= BIT(order);
+			block = rblock;
+			rblock = NULL;
+		} while (block);
 
 		if (!pages)
 			break;
-- 
2.34.1

Re: [PATCH v3] drm: Optimise for continuous memory allocation

Posted by Arunpravin Paneer Selvam 2 years, 9 months ago

Hi Xinhui,

On 11/28/2022 12:04 PM, xinhui pan wrote:
> Currently drm-buddy does not have full knowledge of continuous memory.
>
> Lets consider scenario below.
> order 1:    L		    R
> order 0: LL	LR	RL	RR
> for order 1 allocation, it can offer L or R or LR+RL.
>
> For now, we only implement L or R case for continuous memory allocation.
> So this patch aims to implement the LR+RL case.
>
> Signed-off-by: xinhui pan <xinhui.pan@amd.com>
> ---
> change from v2:
> search continuous block in nearby root if needed
>
> change from v1:
> implement top-down continuous allocation
> ---
>   drivers/gpu/drm/drm_buddy.c | 78 +++++++++++++++++++++++++++++++++----
>   1 file changed, 71 insertions(+), 7 deletions(-)
>
> diff --git a/drivers/gpu/drm/drm_buddy.c b/drivers/gpu/drm/drm_buddy.c
> index 11bb59399471..ff58eb3136d2 100644
> --- a/drivers/gpu/drm/drm_buddy.c
> +++ b/drivers/gpu/drm/drm_buddy.c
> @@ -386,6 +386,58 @@ alloc_range_bias(struct drm_buddy *mm,
>   	return ERR_PTR(err);
>   }
>   
> +static struct drm_buddy_block *
> +find_continuous_blocks(struct drm_buddy *mm,
> +		       int order,
> +		       unsigned long flags,
> +		       struct drm_buddy_block **rn)
> +{
> +	struct list_head *head = &mm->free_list[order];
> +	struct drm_buddy_block *node, *parent, *free_node, *max_node = NULL;
NIT: We usually name the variable as *block or ***_block for drm buddy 
and we have *node or ***_node for drm mm manager.
> +	int i;
> +
> +	list_for_each_entry(free_node, head, link) {
> +		if (max_node) {
> +			if (!(flags & DRM_BUDDY_TOPDOWN_ALLOCATION))
> +				break;
> +
> +			if (drm_buddy_block_offset(free_node) <
> +			    drm_buddy_block_offset(max_node))
> +				continue;
> +		}
> +
> +		parent = free_node;
> +		do {
> +			node = parent;
> +			parent = parent->parent;
> +		} while (parent && parent->right == node);
> +
> +		if (!parent) {
> +			for (i = 0; i < mm->n_roots - 1; i++)
> +				if (mm->roots[i] == node)
> +					break;
> +			if (i == mm->n_roots - 1)
> +				continue;
> +			node = mm->roots[i + 1];
> +		} else {
> +			node = parent->right;
> +		}
> +
> +		while (drm_buddy_block_is_split(node))
> +			node = node->left;
> +
> +		if (drm_buddy_block_is_free(node) &&
> +		    drm_buddy_block_order(node) == order) {
> +			*rn = node;
> +			max_node = free_node;
> +			BUG_ON(drm_buddy_block_offset(node) !=
> +				drm_buddy_block_offset(max_node) +
> +				drm_buddy_block_size(mm, max_node));
> +		}
> +	}
> +	return max_node;
> +}
> +
>   static struct drm_buddy_block *
>   get_maxblock(struct list_head *head)
>   {
> @@ -637,7 +689,7 @@ int drm_buddy_alloc_blocks(struct drm_buddy *mm,
>   			   struct list_head *blocks,
>   			   unsigned long flags)
>   {
> -	struct drm_buddy_block *block = NULL;
> +	struct drm_buddy_block *block = NULL, *rblock = NULL;
>   	unsigned int min_order, order;
>   	unsigned long pages;
>   	LIST_HEAD(allocated);
> @@ -689,17 +741,29 @@ int drm_buddy_alloc_blocks(struct drm_buddy *mm,
>   				break;
>   
>   			if (order-- == min_order) {
> +				if (!(flags & DRM_BUDDY_RANGE_ALLOCATION) &&
> +				    min_order != 0 && pages == BIT(order + 1)) {
> +					block = find_continuous_blocks(mm,
> +								       order,
> +								       flags,
> +								       &rblock);
> +					if (block)
> +						break;
> +				}
>   				err = -ENOSPC;
>   				goto err_free;
>   			}
>   		} while (1);
>   
> -		mark_allocated(block);
> -		mm->avail -= drm_buddy_block_size(mm, block);
> -		kmemleak_update_trace(block);
> -		list_add_tail(&block->link, &allocated);
> -
> -		pages -= BIT(order);
> +		do {
> +			mark_allocated(block);
> +			mm->avail -= drm_buddy_block_size(mm, block);
> +			kmemleak_update_trace(block);
> +			list_add_tail(&block->link, &allocated);
> +			pages -= BIT(order);
> +			block = rblock;
> +			rblock = NULL;
> +		} while (block);
I think with this approach, if we are lucky enough we may get contiguous 
blocks in one order level down in RL
combination from the freelist?

Regards,
Arun
>   
>   		if (!pages)
>   			break;

回复: [PATCH v3] drm: Optimise for continuous memory allocation

Posted by Pan, Xinhui 2 years, 9 months ago

[AMD Official Use Only - General]

Hi Arun,
Thanks for your reply. comments are inline.
________________________________________
发件人: Paneer Selvam, Arunpravin <Arunpravin.PaneerSelvam@amd.com>
发送时间: 2022年11月29日 1:09
收件人: Pan, Xinhui; amd-gfx@lists.freedesktop.org
抄送: linux-kernel@vger.kernel.org; dri-devel@lists.freedesktop.org; matthew.auld@intel.com; daniel@ffwll.ch; Koenig, Christian
主题: Re: [PATCH v3] drm: Optimise for continuous memory allocation

Hi Xinhui,

On 11/28/2022 12:04 PM, xinhui pan wrote:
> Currently drm-buddy does not have full knowledge of continuous memory.
>
> Lets consider scenario below.
> order 1:    L             R
> order 0: LL   LR      RL      RR
> for order 1 allocation, it can offer L or R or LR+RL.
>
> For now, we only implement L or R case for continuous memory allocation.
> So this patch aims to implement the LR+RL case.
>
> Signed-off-by: xinhui pan <xinhui.pan@amd.com>
> ---
> change from v2:
> search continuous block in nearby root if needed
>
> change from v1:
> implement top-down continuous allocation
> ---
>   drivers/gpu/drm/drm_buddy.c | 78 +++++++++++++++++++++++++++++++++----
>   1 file changed, 71 insertions(+), 7 deletions(-)
>
> diff --git a/drivers/gpu/drm/drm_buddy.c b/drivers/gpu/drm/drm_buddy.c
> index 11bb59399471..ff58eb3136d2 100644
> --- a/drivers/gpu/drm/drm_buddy.c
> +++ b/drivers/gpu/drm/drm_buddy.c
> @@ -386,6 +386,58 @@ alloc_range_bias(struct drm_buddy *mm,
>       return ERR_PTR(err);
>   }
>
> +static struct drm_buddy_block *
> +find_continuous_blocks(struct drm_buddy *mm,
> +                    int order,
> +                    unsigned long flags,
> +                    struct drm_buddy_block **rn)
> +{
> +     struct list_head *head = &mm->free_list[order];
> +     struct drm_buddy_block *node, *parent, *free_node, *max_node = NULL;
NIT: We usually name the variable as *block or ***_block for drm buddy
and we have *node or ***_node for drm mm manager.

[xh] Oh, yes. The code naming is important. Will fix it.

> +     int i;
> +
> +     list_for_each_entry(free_node, head, link) {
> +             if (max_node) {
> +                     if (!(flags & DRM_BUDDY_TOPDOWN_ALLOCATION))
> +                             break;
> +
> +                     if (drm_buddy_block_offset(free_node) <
> +                         drm_buddy_block_offset(max_node))
> +                             continue;
> +             }
> +
> +             parent = free_node;
> +             do {
> +                     node = parent;
> +                     parent = parent->parent;
> +             } while (parent && parent->right == node);
> +
> +             if (!parent) {
> +                     for (i = 0; i < mm->n_roots - 1; i++)
> +                             if (mm->roots[i] == node)
> +                                     break;
> +                     if (i == mm->n_roots - 1)
> +                             continue;
> +                     node = mm->roots[i + 1];
> +             } else {
> +                     node = parent->right;
> +             }
> +
> +             while (drm_buddy_block_is_split(node))
> +                     node = node->left;
> +
> +             if (drm_buddy_block_is_free(node) &&
> +                 drm_buddy_block_order(node) == order) {
> +                     *rn = node;
> +                     max_node = free_node;
> +                     BUG_ON(drm_buddy_block_offset(node) !=
> +                             drm_buddy_block_offset(max_node) +
> +                             drm_buddy_block_size(mm, max_node));
> +             }
> +     }
> +     return max_node;
> +}
> +
>   static struct drm_buddy_block *
>   get_maxblock(struct list_head *head)
>   {
> @@ -637,7 +689,7 @@ int drm_buddy_alloc_blocks(struct drm_buddy *mm,
>                          struct list_head *blocks,
>                          unsigned long flags)
>   {
> -     struct drm_buddy_block *block = NULL;
> +     struct drm_buddy_block *block = NULL, *rblock = NULL;
>       unsigned int min_order, order;
>       unsigned long pages;
>       LIST_HEAD(allocated);
> @@ -689,17 +741,29 @@ int drm_buddy_alloc_blocks(struct drm_buddy *mm,
>                               break;
>
>                       if (order-- == min_order) {
> +                             if (!(flags & DRM_BUDDY_RANGE_ALLOCATION) &&
> +                                 min_order != 0 && pages == BIT(order + 1)) {
> +                                     block = find_continuous_blocks(mm,
> +                                                                    order,
> +                                                                    flags,
> +                                                                    &rblock);
> +                                     if (block)
> +                                             break;
> +                             }
>                               err = -ENOSPC;
>                               goto err_free;
>                       }
>               } while (1);
>
> -             mark_allocated(block);
> -             mm->avail -= drm_buddy_block_size(mm, block);
> -             kmemleak_update_trace(block);
> -             list_add_tail(&block->link, &allocated);
> -
> -             pages -= BIT(order);
> +             do {
> +                     mark_allocated(block);
> +                     mm->avail -= drm_buddy_block_size(mm, block);
> +                     kmemleak_update_trace(block);
> +                     list_add_tail(&block->link, &allocated);
> +                     pages -= BIT(order);
> +                     block = rblock;
> +                     rblock = NULL;
> +             } while (block);
I think with this approach, if we are lucky enough we may get contiguous
blocks in one order level down in RL
combination from the freelist?

[xh] That is just what I did at first time like something below.
       list_for_each_entry(node, head, link)
               list_for_each_entry_reverse(rnode, head, link)

I do the test with one ROCM application while running gdm restart background which is a very normal scenario.

The test result shows there is about 4% chance to find the continuous blocks in this case. This 4% is really good enough as eviction is much more expensive.
So after that, I need take care of the rest 96% to not introduce too much workload.

Compared with the two loops, walking through the tree is a little cheaper.

thanks
xinhui

Regards,
Arun
>
>               if (!pages)
>                       break;