Allocate a rw semaphore and hang off vm_private_data for
synchronization use by vmas that could be involved in pmd sharing. Only
add infrastructure for the new lock here. Actual use will be added in
a subsequent patch.
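A minimal sketch of the intended calling pattern (illustrative only; the
real call sites land in the subsequent patch):

	#include <linux/hugetlb.h>

	/*
	 * Illustrative sketch, not part of this patch: each wrapper checks
	 * __vma_shareable_flags_pmd() internally, so callers can take and
	 * drop the lock unconditionally; for private or unshareable vmas
	 * the calls are no-ops.
	 */
	static void example_reader(struct vm_area_struct *vma)
	{
		hugetlb_vma_lock_read(vma);
		/* ... fault on or walk ptes that a shared pmd may back ... */
		hugetlb_vma_unlock_read(vma);
	}

	static bool example_unshare(struct vm_area_struct *vma)
	{
		/* Writers (e.g. when unsharing a pmd) take it exclusively. */
		if (!hugetlb_vma_trylock_write(vma))
			return false;	/* contended; caller can retry */
		/* ... unshare pmds within this vma ... */
		hugetlb_vma_unlock_write(vma);
		return true;
	}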
Signed-off-by: Mike Kravetz <mike.kravetz@oracle.com>
---
include/linux/hugetlb.h |  36 ++++++++-
kernel/fork.c           |   6 +-
mm/hugetlb.c            | 170 ++++++++++++++++++++++++++++++++++++----
mm/rmap.c               |   8 +-
4 files changed, 197 insertions(+), 23 deletions(-)
diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h
index acace1a25226..852f911d676e 100644
--- a/include/linux/hugetlb.h
+++ b/include/linux/hugetlb.h
@@ -126,7 +126,7 @@ struct hugepage_subpool *hugepage_new_subpool(struct hstate *h, long max_hpages,
long min_hpages);
void hugepage_put_subpool(struct hugepage_subpool *spool);
-void reset_vma_resv_huge_pages(struct vm_area_struct *vma);
+void hugetlb_dup_vma_private(struct vm_area_struct *vma);
void clear_vma_resv_huge_pages(struct vm_area_struct *vma);
int hugetlb_sysctl_handler(struct ctl_table *, int, void *, size_t *, loff_t *);
int hugetlb_overcommit_handler(struct ctl_table *, int, void *, size_t *,
@@ -214,6 +214,13 @@ struct page *follow_huge_pud(struct mm_struct *mm, unsigned long address,
struct page *follow_huge_pgd(struct mm_struct *mm, unsigned long address,
pgd_t *pgd, int flags);
+void hugetlb_vma_lock_read(struct vm_area_struct *vma);
+void hugetlb_vma_unlock_read(struct vm_area_struct *vma);
+void hugetlb_vma_lock_write(struct vm_area_struct *vma);
+void hugetlb_vma_unlock_write(struct vm_area_struct *vma);
+int hugetlb_vma_trylock_write(struct vm_area_struct *vma);
+void hugetlb_vma_assert_locked(struct vm_area_struct *vma);
+
int pmd_huge(pmd_t pmd);
int pud_huge(pud_t pud);
unsigned long hugetlb_change_protection(struct vm_area_struct *vma,
@@ -225,7 +232,7 @@ void hugetlb_unshare_all_pmds(struct vm_area_struct *vma);
#else /* !CONFIG_HUGETLB_PAGE */
-static inline void reset_vma_resv_huge_pages(struct vm_area_struct *vma)
+static inline void hugetlb_dup_vma_private(struct vm_area_struct *vma)
{
}
@@ -336,6 +343,31 @@ static inline int prepare_hugepage_range(struct file *file,
return -EINVAL;
}
+static inline void hugetlb_vma_lock_read(struct vm_area_struct *vma)
+{
+}
+
+static inline void hugetlb_vma_unlock_read(struct vm_area_struct *vma)
+{
+}
+
+static inline void hugetlb_vma_lock_write(struct vm_area_struct *vma)
+{
+}
+
+static inline void hugetlb_vma_unlock_write(struct vm_area_struct *vma)
+{
+}
+
+static inline int hugetlb_vma_trylock_write(struct vm_area_struct *vma)
+{
+ return 1;
+}
+
+static inline void hugetlb_vma_assert_locked(struct vm_area_struct *vma)
+{
+}
+
static inline int pmd_huge(pmd_t pmd)
{
return 0;
diff --git a/kernel/fork.c b/kernel/fork.c
index 9470220e8f43..421c143286d2 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -675,12 +675,10 @@ static __latent_entropy int dup_mmap(struct mm_struct *mm,
}
/*
- * Clear hugetlb-related page reserves for children. This only
- * affects MAP_PRIVATE mappings. Faults generated by the child
- * are not guaranteed to succeed, even if read-only
+ * Copy/update hugetlb private vma information.
*/
if (is_vm_hugetlb_page(tmp))
- reset_vma_resv_huge_pages(tmp);
+ hugetlb_dup_vma_private(tmp);
/*
* Link in the new vma and copy the page table entries.
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index 758b6844d566..6fb0bff2c7ee 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -91,6 +91,8 @@ struct mutex *hugetlb_fault_mutex_table ____cacheline_aligned_in_smp;
/* Forward declaration */
static int hugetlb_acct_memory(struct hstate *h, long delta);
+static void hugetlb_vma_lock_free(struct vm_area_struct *vma);
+static void hugetlb_vma_lock_alloc(struct vm_area_struct *vma);
static inline bool subpool_is_free(struct hugepage_subpool *spool)
{
@@ -1008,12 +1010,25 @@ static int is_vma_resv_set(struct vm_area_struct *vma, unsigned long flag)
return (get_vma_private_data(vma) & flag) != 0;
}
-/* Reset counters to 0 and clear all HPAGE_RESV_* flags */
-void reset_vma_resv_huge_pages(struct vm_area_struct *vma)
+void hugetlb_dup_vma_private(struct vm_area_struct *vma)
{
VM_BUG_ON_VMA(!is_vm_hugetlb_page(vma), vma);
+ /*
+ * Clear vm_private_data
+ * - For MAP_PRIVATE mappings, this is the reserve map which does
+ * not apply to children. Faults generated by the children are
+ * not guaranteed to succeed, even if read-only.
+ * - For shared mappings this is a per-vma semaphore that may be
+ * allocated below.
+ */
+ vma->vm_private_data = (void *)0;
if (!(vma->vm_flags & VM_MAYSHARE))
- vma->vm_private_data = (void *)0;
+ return;
+
+ /*
+ * Allocate semaphore if pmd sharing is possible.
+ */
+ hugetlb_vma_lock_alloc(vma);
}
/*
@@ -1044,7 +1059,7 @@ void clear_vma_resv_huge_pages(struct vm_area_struct *vma)
kref_put(&reservations->refs, resv_map_release);
}
- reset_vma_resv_huge_pages(vma);
+ hugetlb_dup_vma_private(vma);
}
/* Returns true if the VMA has associated reserve pages */
@@ -4623,16 +4638,21 @@ static void hugetlb_vm_op_open(struct vm_area_struct *vma)
resv_map_dup_hugetlb_cgroup_uncharge_info(resv);
kref_get(&resv->refs);
}
+
+ hugetlb_vma_lock_alloc(vma);
}
static void hugetlb_vm_op_close(struct vm_area_struct *vma)
{
struct hstate *h = hstate_vma(vma);
- struct resv_map *resv = vma_resv_map(vma);
+ struct resv_map *resv;
struct hugepage_subpool *spool = subpool_vma(vma);
unsigned long reserve, start, end;
long gbl_reserve;
+ hugetlb_vma_lock_free(vma);
+
+ resv = vma_resv_map(vma);
if (!resv || !is_vma_resv_set(vma, HPAGE_RESV_OWNER))
return;
@@ -6447,6 +6467,11 @@ bool hugetlb_reserve_pages(struct inode *inode,
return false;
}
+ /*
+ * vma specific semaphore used for pmd sharing synchronization
+ */
+ hugetlb_vma_lock_alloc(vma);
+
/*
* Only apply hugepage reservation if asked. At fault time, an
* attempt will be made for VM_NORESERVE to allocate a page
@@ -6470,12 +6495,11 @@ bool hugetlb_reserve_pages(struct inode *inode,
resv_map = inode_resv_map(inode);
chg = region_chg(resv_map, from, to, &regions_needed);
-
} else {
/* Private mapping. */
resv_map = resv_map_alloc();
if (!resv_map)
- return false;
+ goto out_err;
chg = to - from;
@@ -6570,6 +6594,7 @@ bool hugetlb_reserve_pages(struct inode *inode,
hugetlb_cgroup_uncharge_cgroup_rsvd(hstate_index(h),
chg * pages_per_huge_page(h), h_cg);
out_err:
+ hugetlb_vma_lock_free(vma);
if (!vma || vma->vm_flags & VM_MAYSHARE)
/* Only call region_abort if the region_chg succeeded but the
* region_add failed or didn't run.
@@ -6649,14 +6674,34 @@ static unsigned long page_table_shareable(struct vm_area_struct *svma,
}
static bool __vma_aligned_range_pmd_shareable(struct vm_area_struct *vma,
- unsigned long start, unsigned long end)
+ unsigned long start, unsigned long end,
+ bool check_vma_lock)
{
+#ifdef CONFIG_USERFAULTFD
+ if (uffd_disable_huge_pmd_share(vma))
+ return false;
+#endif
/*
* check on proper vm_flags and page table alignment
*/
- if (vma->vm_flags & VM_MAYSHARE && range_in_vma(vma, start, end))
- return true;
- return false;
+ if (!(vma->vm_flags & VM_MAYSHARE))
+ return false;
+ if (check_vma_lock && !vma->vm_private_data)
+ return false;
+ if (!range_in_vma(vma, start, end))
+ return false;
+ return true;
+}
+
+static bool vma_pmd_shareable(struct vm_area_struct *vma)
+{
+ unsigned long start = ALIGN(vma->vm_start, PUD_SIZE),
+ end = ALIGN_DOWN(vma->vm_end, PUD_SIZE);
+
+ if (start >= end)
+ return false;
+
+ return __vma_aligned_range_pmd_shareable(vma, start, end, false);
}
static bool vma_addr_pmd_shareable(struct vm_area_struct *vma,
@@ -6665,15 +6710,11 @@ static bool vma_addr_pmd_shareable(struct vm_area_struct *vma,
unsigned long start = addr & PUD_MASK;
unsigned long end = start + PUD_SIZE;
- return __vma_aligned_range_pmd_shareable(vma, start, end);
+ return __vma_aligned_range_pmd_shareable(vma, start, end, true);
}
bool want_pmd_share(struct vm_area_struct *vma, unsigned long addr)
{
-#ifdef CONFIG_USERFAULTFD
- if (uffd_disable_huge_pmd_share(vma))
- return false;
-#endif
return vma_addr_pmd_shareable(vma, addr);
}
@@ -6704,6 +6745,95 @@ void adjust_range_if_pmd_sharing_possible(struct vm_area_struct *vma,
*end = ALIGN(*end, PUD_SIZE);
}
+static bool __vma_shareable_flags_pmd(struct vm_area_struct *vma)
+{
+ return vma->vm_flags & (VM_MAYSHARE | VM_SHARED) &&
+ vma->vm_private_data;
+}
+
+void hugetlb_vma_lock_read(struct vm_area_struct *vma)
+{
+ if (__vma_shareable_flags_pmd(vma))
+ down_read((struct rw_semaphore *)vma->vm_private_data);
+}
+
+void hugetlb_vma_unlock_read(struct vm_area_struct *vma)
+{
+ if (__vma_shareable_flags_pmd(vma))
+ up_read((struct rw_semaphore *)vma->vm_private_data);
+}
+
+void hugetlb_vma_lock_write(struct vm_area_struct *vma)
+{
+ if (__vma_shareable_flags_pmd(vma))
+ down_write((struct rw_semaphore *)vma->vm_private_data);
+}
+
+void hugetlb_vma_unlock_write(struct vm_area_struct *vma)
+{
+ if (__vma_shareable_flags_pmd(vma))
+ up_write((struct rw_semaphore *)vma->vm_private_data);
+}
+
+int hugetlb_vma_trylock_write(struct vm_area_struct *vma)
+{
+ if (!__vma_shareable_flags_pmd(vma))
+ return 1;
+
+ return down_write_trylock((struct rw_semaphore *)vma->vm_private_data);
+}
+
+void hugetlb_vma_assert_locked(struct vm_area_struct *vma)
+{
+ if (__vma_shareable_flags_pmd(vma))
+ lockdep_assert_held((struct rw_semaphore *)
+ vma->vm_private_data);
+}
+
+static void hugetlb_vma_lock_free(struct vm_area_struct *vma)
+{
+ /*
+ * Only present in sharable vmas. See comment in
+ * __unmap_hugepage_range_final about the neeed to check both
+ * VM_SHARED and VM_MAYSHARE in free path
+ */
+ if (!vma || !(vma->vm_flags & (VM_MAYSHARE | VM_SHARED)))
+ return;
+
+ if (vma->vm_private_data) {
+ kfree(vma->vm_private_data);
+ vma->vm_private_data = NULL;
+ }
+}
+
+static void hugetlb_vma_lock_alloc(struct vm_area_struct *vma)
+{
+ struct rw_semaphore *vma_sema;
+
+ /* Only establish in (flags) sharable vmas */
+ if (!vma || !(vma->vm_flags & VM_MAYSHARE))
+ return;
+
+ /* Should never get here with non-NULL vm_private_data */
+ if (vma->vm_private_data)
+ return;
+
+ /* Check size/alignment for pmd sharing possible */
+ if (!vma_pmd_shareable(vma))
+ return;
+
+ vma_sema = kmalloc(sizeof(*vma_sema), GFP_KERNEL);
+ if (!vma_sema)
+ /*
+ * If we can not allocate semaphore, then vma can not
+ * participate in pmd sharing.
+ */
+ return;
+
+ init_rwsem(vma_sema);
+ vma->vm_private_data = vma_sema;
+}
+
/*
* Search for a shareable pmd page for hugetlb. In any case calls pmd_alloc()
* and returns the corresponding pte. While this is not necessary for the
@@ -6790,6 +6920,14 @@ int huge_pmd_unshare(struct mm_struct *mm, struct vm_area_struct *vma,
}
#else /* !CONFIG_ARCH_WANT_HUGE_PMD_SHARE */
+static void hugetlb_vma_lock_free(struct vm_area_struct *vma)
+{
+}
+
+static void hugetlb_vma_lock_alloc(struct vm_area_struct *vma)
+{
+}
+
pte_t *huge_pmd_share(struct mm_struct *mm, struct vm_area_struct *vma,
unsigned long addr, pud_t *pud)
{
diff --git a/mm/rmap.c b/mm/rmap.c
index ad9c97c6445c..55209e029847 100644
--- a/mm/rmap.c
+++ b/mm/rmap.c
@@ -24,7 +24,7 @@
* mm->mmap_lock
* mapping->invalidate_lock (in filemap_fault)
* page->flags PG_locked (lock_page)
- * hugetlbfs_i_mmap_rwsem_key (in huge_pmd_share)
+ * hugetlbfs_i_mmap_rwsem_key (in huge_pmd_share, see hugetlbfs below)
* mapping->i_mmap_rwsem
* anon_vma->rwsem
* mm->page_table_lock or pte_lock
@@ -44,6 +44,12 @@
* anon_vma->rwsem,mapping->i_mmap_rwsem (memory_failure, collect_procs_anon)
* ->tasklist_lock
* pte map lock
+ *
+ * hugetlbfs PageHuge() take locks in this order:
+ * hugetlb_fault_mutex (hugetlbfs specific page fault mutex)
+ * vma_lock (hugetlb specific lock for pmd_sharing)
+ * mapping->i_mmap_rwsem (also used for hugetlb pmd sharing)
+ * page->flags PG_locked (lock_page)
*/
#include <linux/mm.h>
--
2.37.1
On 2022/8/25 1:57, Mike Kravetz wrote:
> Allocate a rw semaphore and hang off vm_private_data for
> synchronization use by vmas that could be involved in pmd sharing. Only
> add infrastructure for the new lock here. Actual use will be added in
> a subsequent patch.
>
> Signed-off-by: Mike Kravetz <mike.kravetz@oracle.com>
<snip>
> +static void hugetlb_vma_lock_free(struct vm_area_struct *vma)
> +{
> + /*
> + * Only present in sharable vmas. See comment in
> + * __unmap_hugepage_range_final about the neeed to check both
s/neeed/need/
> + * VM_SHARED and VM_MAYSHARE in free path
I think there might be some wrong checks around this patch. As above comment said, we
need to check both flags, so we should do something like below instead?
if (!(vma->vm_flags & (VM_MAYSHARE | VM_SHARED) == (VM_MAYSHARE | VM_SHARED)))
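As a side note, == binds tighter than & in C, so the "both flags" form needs
its own parentheses. A minimal sketch of the difference between the two
checks (illustrative only):

	/* True if EITHER flag is set -- the check the patch uses: */
	static bool either_flag_set(struct vm_area_struct *vma)
	{
		return vma->vm_flags & (VM_MAYSHARE | VM_SHARED);
	}

	/* True only if BOTH flags are set -- the alternative above: */
	static bool both_flags_set(struct vm_area_struct *vma)
	{
		return (vma->vm_flags & (VM_MAYSHARE | VM_SHARED)) ==
		       (VM_MAYSHARE | VM_SHARED);
	}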
> + */
> + if (!vma || !(vma->vm_flags & (VM_MAYSHARE | VM_SHARED)))
> + return;
> +
> + if (vma->vm_private_data) {
> + kfree(vma->vm_private_data);
> + vma->vm_private_data = NULL;
> + }
> +}
> +
> +static void hugetlb_vma_lock_alloc(struct vm_area_struct *vma)
> +{
> + struct rw_semaphore *vma_sema;
> +
> + /* Only establish in (flags) sharable vmas */
> + if (!vma || !(vma->vm_flags & VM_MAYSHARE))
> + return;
> +
> + /* Should never get here with non-NULL vm_private_data */
We can get here with non-NULL vm_private_data when called from hugetlb_vm_op_open during fork?
Also there's one missing change on comment:
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index d0617d64d718..4bc844a1d312 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -863,7 +863,7 @@ __weak unsigned long vma_mmu_pagesize(struct vm_area_struct *vma)
* faults in a MAP_PRIVATE mapping. Only the process that called mmap()
* is guaranteed to have their future faults succeed.
*
- * With the exception of reset_vma_resv_huge_pages() which is called at fork(),
+ * With the exception of hugetlb_dup_vma_private() which is called at fork(),
* the reserve counters are updated with the hugetlb_lock held. It is safe
* to reset the VMA at fork() time as it is not in use yet and there is no
* chance of the global counters getting corrupted as a result of the values.
Otherwise this patch looks good to me. Thanks.
Thanks,
Miaohe Lin
On 08/27/22 17:30, Miaohe Lin wrote:
> On 2022/8/25 1:57, Mike Kravetz wrote:
> > Allocate a rw semaphore and hang off vm_private_data for
> > synchronization use by vmas that could be involved in pmd sharing. Only
> > add infrastructure for the new lock here. Actual use will be added in
> > a subsequent patch.
> >
> > Signed-off-by: Mike Kravetz <mike.kravetz@oracle.com>
>
> <snip>
>
> > +static void hugetlb_vma_lock_free(struct vm_area_struct *vma)
> > +{
> > + /*
> > + * Only present in sharable vmas. See comment in
> > + * __unmap_hugepage_range_final about the neeed to check both
>
> s/neeed/need/
>
> > + * VM_SHARED and VM_MAYSHARE in free path
>
> I think there might be some wrong checks around this patch. As above comment said, we
> need to check both flags, so we should do something like below instead?
>
> if (!(vma->vm_flags & (VM_MAYSHARE | VM_SHARED) == (VM_MAYSHARE | VM_SHARED)))
>
> > + */
Thanks. I will update.
> > + if (!vma || !(vma->vm_flags & (VM_MAYSHARE | VM_SHARED)))
> > + return;
> > +
> > + if (vma->vm_private_data) {
> > + kfree(vma->vm_private_data);
> > + vma->vm_private_data = NULL;
> > + }
> > +}
> > +
> > +static void hugetlb_vma_lock_alloc(struct vm_area_struct *vma)
> > +{
> > + struct rw_semaphore *vma_sema;
> > +
> > + /* Only establish in (flags) sharable vmas */
> > + if (!vma || !(vma->vm_flags & VM_MAYSHARE))
> > + return;
> > +
> > + /* Should never get here with non-NULL vm_private_data */
>
> We can get here with non-NULL vm_private_data when called from hugetlb_vm_op_open during fork?
Right!
In fork, we allocate a new semaphore in hugetlb_dup_vma_private, and then
shortly after call hugetlb_vm_op_open.
It works as is, and I can update the comment. However, I wonder if we should
just clear vm_private_data in hugetlb_dup_vma_private and let hugetlb_vm_op_open
do the allocation.
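Something like this untested sketch of that idea:

	void hugetlb_dup_vma_private(struct vm_area_struct *vma)
	{
		VM_BUG_ON_VMA(!is_vm_hugetlb_page(vma), vma);

		/*
		 * Untested sketch: always clear the pointer copied from
		 * the parent and let hugetlb_vm_op_open(), called shortly
		 * after in fork, do the allocation, so that
		 * hugetlb_vma_lock_alloc() never sees stale private data.
		 */
		vma->vm_private_data = (void *)0;
	}

	static void hugetlb_vm_op_open(struct vm_area_struct *vma)
	{
		/* ... existing resv_map handling ... */

		hugetlb_vma_lock_alloc(vma);
	}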
>
> Also there's one missing change on comment:
>
> diff --git a/mm/hugetlb.c b/mm/hugetlb.c
> index d0617d64d718..4bc844a1d312 100644
> --- a/mm/hugetlb.c
> +++ b/mm/hugetlb.c
> @@ -863,7 +863,7 @@ __weak unsigned long vma_mmu_pagesize(struct vm_area_struct *vma)
> * faults in a MAP_PRIVATE mapping. Only the process that called mmap()
> * is guaranteed to have their future faults succeed.
> *
> - * With the exception of reset_vma_resv_huge_pages() which is called at fork(),
> + * With the exception of hugetlb_dup_vma_private() which is called at fork(),
> * the reserve counters are updated with the hugetlb_lock held. It is safe
> * to reset the VMA at fork() time as it is not in use yet and there is no
> * chance of the global counters getting corrupted as a result of the values.
>
>
> Otherwise this patch looks good to me. Thanks.
Will update, Thank you!
--
Mike Kravetz
On 08/29/22 15:24, Mike Kravetz wrote:
> On 08/27/22 17:30, Miaohe Lin wrote:
> > On 2022/8/25 1:57, Mike Kravetz wrote:
> > > Allocate a rw semaphore and hang off vm_private_data for
> > > synchronization use by vmas that could be involved in pmd sharing. Only
> > > add infrastructure for the new lock here. Actual use will be added in
> > > a subsequent patch.
> > >
> > > Signed-off-by: Mike Kravetz <mike.kravetz@oracle.com>
> >
> > <snip>
> >
> > > +static void hugetlb_vma_lock_free(struct vm_area_struct *vma)
> > > +{
> > > + /*
> > > + * Only present in sharable vmas. See comment in
> > > + * __unmap_hugepage_range_final about the neeed to check both
> >
> > s/neeed/need/
> >
> > > + * VM_SHARED and VM_MAYSHARE in free path
> >
> > I think there might be some wrong checks around this patch. As above comment said, we
> > need to check both flags, so we should do something like below instead?
> >
> > if (!(vma->vm_flags & (VM_MAYSHARE | VM_SHARED) == (VM_MAYSHARE | VM_SHARED)))
> >
> > > + */
>
> Thanks. I will update.
>
> > > + if (!vma || !(vma->vm_flags & (VM_MAYSHARE | VM_SHARED)))
> > > + return;
I think you misunderstood the comment which I admit was not very clear. And,
I misunderstood your suggestion. I believe the code is correct as is. Here
is the proposed updated comment/code:
/*
* Only present in sharable vmas. See comment in
* __unmap_hugepage_range_final about how VM_SHARED could
* be set without VM_MAYSHARE. As a result, we need to
* check if either is set in the free path.
*/
if (!vma || !(vma->vm_flags & (VM_MAYSHARE | VM_SHARED)))
return;
Hopefully, that makes more sense.
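For reference, the VM_SHARED-without-VM_MAYSHARE case comes from the final
unmap path; roughly (see the real __unmap_hugepage_range_final in
mm/hugetlb.c):

	/*
	 * Rough sketch: on the final unmap, VM_MAYSHARE is cleared so the
	 * page_table_shareable() test fails on a vma being torn down and
	 * no new pmd sharing can be set up, while VM_SHARED remains set.
	 * That is how a vma can reach the free path with VM_SHARED but
	 * not VM_MAYSHARE.
	 */
	static void example_final_unmap(struct vm_area_struct *vma)
	{
		vma->vm_flags &= ~VM_MAYSHARE;
		/* ... unmap the range and free the page tables ... */
	}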
--
Mike Kravetz
On 2022/9/8 4:50, Mike Kravetz wrote:
> On 08/29/22 15:24, Mike Kravetz wrote:
>> On 08/27/22 17:30, Miaohe Lin wrote:
>>> On 2022/8/25 1:57, Mike Kravetz wrote:
>>>> Allocate a rw semaphore and hang off vm_private_data for
>>>> synchronization use by vmas that could be involved in pmd sharing. Only
>>>> add infrastructure for the new lock here. Actual use will be added in
>>>> a subsequent patch.
>>>>
>>>> Signed-off-by: Mike Kravetz <mike.kravetz@oracle.com>
>>>
>>> <snip>
>>>
>>>> +static void hugetlb_vma_lock_free(struct vm_area_struct *vma)
>>>> +{
>>>> + /*
>>>> + * Only present in sharable vmas. See comment in
>>>> + * __unmap_hugepage_range_final about the neeed to check both
>>>
>>> s/neeed/need/
>>>
>>>> + * VM_SHARED and VM_MAYSHARE in free path
>>>
>>> I think there might be some wrong checks around this patch. As above comment said, we
>>> need to check both flags, so we should do something like below instead?
>>>
>>> if (!(vma->vm_flags & (VM_MAYSHARE | VM_SHARED) == (VM_MAYSHARE | VM_SHARED)))
>>>
>>>> + */
>>
>> Thanks. I will update.
>>
>>>> + if (!vma || !(vma->vm_flags & (VM_MAYSHARE | VM_SHARED)))
>>>> + return;
>
> I think you misunderstood the comment which I admit was not very clear. And,
> I misunderstood your suggestion. I believe the code is correct as is. Here
> is the proposed updated comment/code:
>
> /*
> * Only present in sharable vmas. See comment in
> * __unmap_hugepage_range_final about how VM_SHARED could
> * be set without VM_MAYSHARE. As a result, we need to
> * check if either is set in the free path.
> */
> if (!vma || !(vma->vm_flags & (VM_MAYSHARE | VM_SHARED)))
> return;
>
> Hopefully, that makes more sense.
Somewhat confusing. Thanks for clarifying, Mike.
Thanks,
Miaohe Lin
On 2022/8/30 6:24, Mike Kravetz wrote:
> On 08/27/22 17:30, Miaohe Lin wrote:
>> On 2022/8/25 1:57, Mike Kravetz wrote:
>>> Allocate a rw semaphore and hang off vm_private_data for
>>> synchronization use by vmas that could be involved in pmd sharing. Only
>>> add infrastructure for the new lock here. Actual use will be added in
>>> a subsequent patch.
>>>
>>> Signed-off-by: Mike Kravetz <mike.kravetz@oracle.com>
>>
>> <snip>
>>
>>> +static void hugetlb_vma_lock_free(struct vm_area_struct *vma)
>>> +{
>>> + /*
>>> + * Only present in sharable vmas. See comment in
>>> + * __unmap_hugepage_range_final about the neeed to check both
>>
>> s/neeed/need/
>>
>>> + * VM_SHARED and VM_MAYSHARE in free path
>>
>> I think there might be some wrong checks around this patch. As above comment said, we
>> need to check both flags, so we should do something like below instead?
>>
>> if (!(vma->vm_flags & (VM_MAYSHARE | VM_SHARED) == (VM_MAYSHARE | VM_SHARED)))
>>
>>> + */
>
> Thanks. I will update.
>
>>> + if (!vma || !(vma->vm_flags & (VM_MAYSHARE | VM_SHARED)))
>>> + return;
>>> +
>>> + if (vma->vm_private_data) {
>>> + kfree(vma->vm_private_data);
>>> + vma->vm_private_data = NULL;
>>> + }
>>> +}
>>> +
>>> +static void hugetlb_vma_lock_alloc(struct vm_area_struct *vma)
>>> +{
>>> + struct rw_semaphore *vma_sema;
>>> +
>>> + /* Only establish in (flags) sharable vmas */
>>> + if (!vma || !(vma->vm_flags & VM_MAYSHARE))
>>> + return;
>>> +
>>> + /* Should never get here with non-NULL vm_private_data */
>>
>> We can get here with non-NULL vm_private_data when called from hugetlb_vm_op_open during fork?
>
> Right!
>
> In fork, we allocate a new semaphore in hugetlb_dup_vma_private, and then
> shortly after call hugetlb_vm_op_open.
>
> It works as is, and I can update the comment. However, I wonder if we should
> just clear vm_private_data in hugetlb_dup_vma_private and let hugetlb_vm_op_open
> do the allocation.
I think it's a good idea. We can also avoid allocating memory for the vma_lock (via clear_vma_resv_huge_pages())
only to free the corresponding vma right away (via do_munmap()) in move_vma(). But maybe I'm missing something.
Thanks,
Miaohe Lin
>
>>
>> Also there's one missing change on comment:
>>
>> diff --git a/mm/hugetlb.c b/mm/hugetlb.c
>> index d0617d64d718..4bc844a1d312 100644
>> --- a/mm/hugetlb.c
>> +++ b/mm/hugetlb.c
>> @@ -863,7 +863,7 @@ __weak unsigned long vma_mmu_pagesize(struct vm_area_struct *vma)
>> * faults in a MAP_PRIVATE mapping. Only the process that called mmap()
>> * is guaranteed to have their future faults succeed.
>> *
>> - * With the exception of reset_vma_resv_huge_pages() which is called at fork(),
>> + * With the exception of hugetlb_dup_vma_private() which is called at fork(),
>> * the reserve counters are updated with the hugetlb_lock held. It is safe
>> * to reset the VMA at fork() time as it is not in use yet and there is no
>> * chance of the global counters getting corrupted as a result of the values.
>>
>>
>> Otherwise this patch looks good to me. Thanks.
>
> Will update, Thank you!
>