Multigrain timestamps allow VFS to request fine-grained timestamps when
there has been recent interest in the file's attributes. Unfortunately,
the coarse-grained timestamps can lag the fine-grained ones. A file
stamped with a coarse-grained timestamp after one with a fine-grained
one can appear to have been modified in reverse order. This is
problematic for programs like "make" or "rsync" which depend on being
able to compare timestamps between two different inodes.
One way to prevent this is to ensure that when we stamp a file with a
fine-grained timestamp, that we use that value to establish a floor for
any later timestamp update.
Add a new mg_nsec field to the timekeeper structure for tracking the
nsec value of the most recent multigrain timestamp, along with two new
helper functions for fetching coarse and fine grained timestamps. When
getting a fine-grained timestamp update the mg_nsec with the value that
was fetched via timekeeping_get_ns.
When fetching a coarse-grained timestamp, set the mg_nsec to
TK_MG_NSEC_IGNORE. When fetching a coarse-grained time for a multigrain
timestamp, apply the mg_nsec value if it's not set to TK_MG_NSEC_IGNORE.
Signed-off-by: Jeff Layton <jlayton@kernel.org>
---
include/linux/timekeeper_internal.h | 2 +
include/linux/timekeeping.h | 4 ++
kernel/time/timekeeping.c | 80 +++++++++++++++++++++++++++++++++++++
3 files changed, 86 insertions(+)
diff --git a/include/linux/timekeeper_internal.h b/include/linux/timekeeper_internal.h
index 84ff2844df2a..6583b06e7d08 100644
--- a/include/linux/timekeeper_internal.h
+++ b/include/linux/timekeeper_internal.h
@@ -55,6 +55,7 @@ struct tk_read_base {
* @tai_offset: The current UTC to TAI offset in seconds
* @clock_was_set_seq: The sequence number of clock was set events
* @cs_was_changed_seq: The sequence number of clocksource change events
+ * @mg_nsec: Nanosecond delta for multigrain timestamps
* @next_leap_ktime: CLOCK_MONOTONIC time value of a pending leap-second
* @raw_sec: CLOCK_MONOTONIC_RAW time in seconds
* @monotonic_to_boot: CLOCK_MONOTONIC to CLOCK_BOOTTIME offset
@@ -110,6 +111,7 @@ struct timekeeper {
u64 xtime_interval;
s64 xtime_remainder;
u64 raw_interval;
+ u64 mg_nsec;
/* The ntp_tick_length() value currently being used.
* This cached copy ensures we consistently apply the tick
* length for an entire tick, as ntp_tick_length may change
diff --git a/include/linux/timekeeping.h b/include/linux/timekeeping.h
index fe1e467ba046..5dc0ad619d42 100644
--- a/include/linux/timekeeping.h
+++ b/include/linux/timekeeping.h
@@ -44,6 +44,10 @@ extern void ktime_get_real_ts64(struct timespec64 *tv);
extern void ktime_get_coarse_ts64(struct timespec64 *ts);
extern void ktime_get_coarse_real_ts64(struct timespec64 *ts);
+/* multigrain timestamp support */
+void ktime_get_mg_fine_ts64(struct timespec64 *ts);
+void ktime_get_mg_coarse_ts64(struct timespec64 *ts);
+
void getboottime64(struct timespec64 *ts);
/*
diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c
index 266d02809dbb..7c20c98b1ea8 100644
--- a/kernel/time/timekeeping.c
+++ b/kernel/time/timekeeping.c
@@ -33,6 +33,9 @@
#define TK_MIRROR (1 << 1)
#define TK_CLOCK_WAS_SET (1 << 2)
+/* When mg_nsec is set to this, ignore it */
+#define TK_MG_NSEC_IGNORE (~(0ULL))
+
enum timekeeping_adv_mode {
/* Update timekeeper when a tick has passed */
TK_ADV_TICK,
@@ -139,6 +142,7 @@ static void tk_set_xtime(struct timekeeper *tk, const struct timespec64 *ts)
{
tk->xtime_sec = ts->tv_sec;
tk->tkr_mono.xtime_nsec = (u64)ts->tv_nsec << tk->tkr_mono.shift;
+ tk->mg_nsec = TK_MG_NSEC_IGNORE;
}
static void tk_xtime_add(struct timekeeper *tk, const struct timespec64 *ts)
@@ -146,6 +150,7 @@ static void tk_xtime_add(struct timekeeper *tk, const struct timespec64 *ts)
tk->xtime_sec += ts->tv_sec;
tk->tkr_mono.xtime_nsec += (u64)ts->tv_nsec << tk->tkr_mono.shift;
tk_normalize_xtime(tk);
+ tk->mg_nsec = TK_MG_NSEC_IGNORE;
}
static void tk_set_wall_to_mono(struct timekeeper *tk, struct timespec64 wtm)
@@ -1664,6 +1669,7 @@ void __init timekeeping_init(void)
tk_setup_internals(tk, clock);
tk_set_xtime(tk, &wall_time);
+ tk->mg_nsec = TK_MG_NSEC_IGNORE;
tk->raw_sec = 0;
tk_set_wall_to_mono(tk, wall_to_mono);
@@ -2283,6 +2289,80 @@ void ktime_get_coarse_ts64(struct timespec64 *ts)
}
EXPORT_SYMBOL(ktime_get_coarse_ts64);
+/**
+ * ktime_get_mg_fine_ts64 - Returns a fine-grained time of day in a timespec64
+ * @ts: pointer to the timespec64 to be set
+ *
+ * Multigrain filesystems use a scheme where they use coarse-grained
+ * timestamps most of the time, but will use a fine-grained timestamp
+ * if the previous timestamp was queried and the coarse-grained clock
+ * hasn't ticked yet.
+ *
+ * For this case, the caller is requesting a fine-grained timestamp,
+ * so we must update the sliding mg_nsec value before returning it. This
+ * ensures that any coarse-grained timestamp updates that occur after
+ * this won't appear to have occurred before.
+ *
+ * Fills in @ts with the current fine-grained time of day, suitable for
+ * a file timestamp.
+ */
+void ktime_get_mg_fine_ts64(struct timespec64 *ts)
+{
+ struct timekeeper *tk = &tk_core.timekeeper;
+ unsigned long flags;
+ u32 nsecs;
+
+ WARN_ON(timekeeping_suspended);
+
+ raw_spin_lock_irqsave(&timekeeper_lock, flags);
+ write_seqcount_begin(&tk_core.seq);
+
+ ts->tv_sec = tk->xtime_sec;
+ nsecs = timekeeping_get_ns(&tk->tkr_mono);
+ tk->mg_nsec = nsecs;
+
+ write_seqcount_end(&tk_core.seq);
+ raw_spin_unlock_irqrestore(&timekeeper_lock, flags);
+
+ ts->tv_nsec = 0;
+ timespec64_add_ns(ts, nsecs);
+}
+
+/**
+ * ktime_get_mg_coarse_ts64 - Returns a coarse-grained time of day in a timespec64
+ * @ts: pointer to the timespec64 to be set
+ *
+ * Multigrain filesystems use a scheme where they use coarse-grained
+ * timestamps most of the time, but will use a fine-grained timestamp
+ * if the previous timestamp was queried and the coarse-grained clock
+ * hasn't ticked yet.
+ *
+ * For this case, the caller is requesting a coarse-grained timestamp,
+ * which will always be equal to or later than the latest fine-grained
+ * timestamp that has been handed out.
+ *
+ * Fills in @ts with the current coarse-grained time of day, suitable
+ * for a file timestamp.
+ */
+void ktime_get_mg_coarse_ts64(struct timespec64 *ts)
+{
+ struct timekeeper *tk = &tk_core.timekeeper;
+ unsigned int seq;
+ u64 nsec;
+
+ do {
+ seq = read_seqcount_begin(&tk_core.seq);
+
+ *ts = tk_xtime(tk);
+ nsec = tk->mg_nsec;
+ } while (read_seqcount_retry(&tk_core.seq, seq));
+
+ if (nsec != TK_MG_NSEC_IGNORE) {
+ ts->tv_nsec = 0;
+ timespec64_add_ns(ts, nsec);
+ }
+}
+
/*
* Must hold jiffies_lock
*/
--
2.41.0
Jeff!
On Wed, Oct 18 2023 at 13:41, Jeff Layton wrote:
> +void ktime_get_mg_fine_ts64(struct timespec64 *ts)
> +{
> + struct timekeeper *tk = &tk_core.timekeeper;
> + unsigned long flags;
> + u32 nsecs;
> +
> + WARN_ON(timekeeping_suspended);
> +
> + raw_spin_lock_irqsave(&timekeeper_lock, flags);
> + write_seqcount_begin(&tk_core.seq);
Depending on the usage scenario, this will end up as a scalability issue
which affects _all_ of timekeeping.
The usage of timekeeper_lock and the sequence count has been carefully
crafted to be as non-contended as possible. We went a great length to
optimize that because the ktime_get*() functions are really hotpath all
over the place.
Exposing such an interface which wreckages that is a recipe for disaster
down the road. It might be a non-issue today, but once we hit the
bottleneck of that global lock, we are up the creek without a
paddle. Well not really, but all we can do then is fall back to
ktime_get_real(). So let me ask the obvious question:
Why don't we do that right away?
Many moons ago when we added ktime_get_real_coarse() the main reason was
that reading the time from the underlying hardware was insanely
expensive.
Many moons later this is not true anymore, except for the stupid case
where the BIOS wreckaged the TSC, but that's a hopeless case for
performance no matter what. Optimizing for that would be beyond stupid.
I'm well aware that ktime_get_real_coarse() is still faster than
ktime_get_real() in micro-benchmarks, i.e. 5ns vs. 15ns on the four
years old laptop I'm writing this.
Many moons ago it was in the ballpark of 40ns vs. 5us due to TSC being
useless and even TSC read was way more expensive (factor 8-10x IIRC) in
comparison. That really mattered for FS, but does todays overhead still
make a difference in the real FS use case scenario?
I'm not in the position of running meaningful FS benchmarks to analyze
that, but I think the delta between ktime_get_real_coarse() and
ktime_get_real() on contemporary hardware is small enough that it
justifies this question.
The point is that both functions have pretty much the same D-cache
pattern because they access the same data in the very same
cacheline. The only difference is the actual TSC read and the extra
conversion, but that's it. The TSC read has been massively optimized by
the CPU vendors. I know that the ARM64 counter has been optimized too,
though I have no idea about PPC64 and S390, but I would be truly
surprised if they didn't optimize the hell out of it because time read
is really used heavily both in kernel and user space.
Does anyone have numbers on contemporary hardware to shed some light on
that in the context of FS and the problem at hand?
Thanks,
tglx
On Fri, 2023-10-20 at 00:00 +0200, Thomas Gleixner wrote:
> Jeff!
>
> On Wed, Oct 18 2023 at 13:41, Jeff Layton wrote:
> > +void ktime_get_mg_fine_ts64(struct timespec64 *ts)
> > +{
> > + struct timekeeper *tk = &tk_core.timekeeper;
> > + unsigned long flags;
> > + u32 nsecs;
> > +
> > + WARN_ON(timekeeping_suspended);
> > +
> > + raw_spin_lock_irqsave(&timekeeper_lock, flags);
> > + write_seqcount_begin(&tk_core.seq);
>
> Depending on the usage scenario, this will end up as a scalability issue
> which affects _all_ of timekeeping.
>
> The usage of timekeeper_lock and the sequence count has been carefully
> crafted to be as non-contended as possible. We went a great length to
> optimize that because the ktime_get*() functions are really hotpath all
> over the place.
>
> Exposing such an interface which wreckages that is a recipe for disaster
> down the road. It might be a non-issue today, but once we hit the
> bottleneck of that global lock, we are up the creek without a
> paddle.
>
Thanks for the explanation, Thomas. That's understandable, and
that was my main worry with this set. I'll look at doing this another
way given your feedback. I just started by plumbing this into the
timekeeping code since that seemed like the most obvious place to do it.
I think it's probably still possible to do this by caching the values
returned by the timekeeper at the vfs layer, but there seems to be some
reticence to the basic idea that I don't quite understand yet.
> Well not really, but all we can do then is fall back to
> ktime_get_real(). So let me ask the obvious question:
>
> Why don't we do that right away?
>
> Many moons ago when we added ktime_get_real_coarse() the main reason was
> that reading the time from the underlying hardware was insanely
> expensive.
>
> Many moons later this is not true anymore, except for the stupid case
> where the BIOS wreckaged the TSC, but that's a hopeless case for
> performance no matter what. Optimizing for that would be beyond stupid.
>
> I'm well aware that ktime_get_real_coarse() is still faster than
> ktime_get_real() in micro-benchmarks, i.e. 5ns vs. 15ns on the four
> years old laptop I'm writing this.
>
> Many moons ago it was in the ballpark of 40ns vs. 5us due to TSC being
> useless and even TSC read was way more expensive (factor 8-10x IIRC) in
> comparison. That really mattered for FS, but does todays overhead still
> make a difference in the real FS use case scenario?
>
> I'm not in the position of running meaningful FS benchmarks to analyze
> that, but I think the delta between ktime_get_real_coarse() and
> ktime_get_real() on contemporary hardware is small enough that it
> justifies this question.
>
> The point is that both functions have pretty much the same D-cache
> pattern because they access the same data in the very same
> cacheline. The only difference is the actual TSC read and the extra
> conversion, but that's it. The TSC read has been massively optimized by
> the CPU vendors. I know that the ARM64 counter has been optimized too,
> though I have no idea about PPC64 and S390, but I would be truly
> surprised if they didn't optimize the hell out of it because time read
> is really used heavily both in kernel and user space.
>
> Does anyone have numbers on contemporary hardware to shed some light on
> that in the context of FS and the problem at hand?
That was sort of my suspicion and it's good to have confirmation that
fetching a fine-grained timespec64 from the timekeeper is cheap. It
looked that way when I was poking around in there, but I wasn't sure
whether it was always the case.
It turns out however that the main benefit of using a coarse-grained
timestamp is that it allows the file system to skip a lot of inode
metadata updates.
The way it works today is that when we go to update the timestamp on an
inode, we check whether they have made any visible change, and we dirty
the inode metadata if so. This means that we only really update the
inode on disk once per jiffy or so when an inode is under heavy writes.
The idea with this set is to only use fine-grained timestamps when
someone is actively fetching them via getattr. When the mtime or ctime
is viewed via getattr, we mark the inode and then the following
timestamp update will get a fine-grained timestamp (unless the coarse-
grained clock has already ticked).
That allows us to keep the number of inode updates down to a bare
minimum, but still allows an observer to always see a change in the
timestamp when there have been changes to the inode.
Thanks again for the review! For the next iteration I (probably) won't
need to touch the timekeeper.
--
Jeff Layton <jlayton@kernel.org>
On Wed, 18 Oct 2023 at 10:41, Jeff Layton <jlayton@kernel.org> wrote:
>
> One way to prevent this is to ensure that when we stamp a file with a
> fine-grained timestamp, that we use that value to establish a floor for
> any later timestamp update.
I'm very leery of this.
I don't like how it's using a global time - and a global fine-grained
offset - when different filesystems will very naturally have different
granularities. I also don't like how it's now using that global lock.
Yes, yes, since the use of this all is then gated by the 'is_mgtime()'
thing, any filesystem with big granularities will presumably never set
FS_MGTIME in the first place, and that hides the worst pointless cases.
But it still feels iffy to me.
Also, the whole current_ctime() logic seems wrong. Later (in 4/9), you do this:
static struct timespec64 current_ctime(struct inode *inode)
{
if (is_mgtime(inode))
return current_mgtime(inode);
and current_mgtime() does
if (nsec & I_CTIME_QUERIED) {
ktime_get_real_ts64(&now);
return timestamp_truncate(now, inode);
}
so once the code has set I_CTIME_QUERIED, it will now use the
expensive fine-grained time - even when it makes no sense.
As far as I can tell, there is *never* a reason to get the
fine-grained time if the old inode ctime is already sufficiently far
away.
IOW, my gut feel is that all this logic should always not only be
guarded by FS_MGTIME (like you do seem to do), *and* by "has anybody
even queried this time" - it should *also* always be guarded by "if I
get the coarse-grained time, is that sufficient?"
So I think the current_ctime() logic should be something along the lines of
static struct timespec64 current_ctime(struct inode *inode)
{
struct timespec64 ts64 = current_time(inode);
unsigned long old_ctime_sec = inode->i_ctime_sec;
unsigned int old_ctime_nsec = inode->i_ctime_nsec;
if (ts64.tv_sec != old_ctime_sec)
return ts64;
/*
* No need to check is_mgtime(inode) - the I_CTIME_QUERIED
* flag is only set for mgtime filesystems
*/
if (!(old_ctime_nsec & I_CTIME_QUERIED))
return ts64;
old_ctime_nsec &= ~I_CTIME_QUERIED;
if (ts64.tv_nsec > old_ctime_nsec + inode->i_sb->s_time_gran)
return ts64;
/* Ok, only *now* do we do a finegrained value */
ktime_get_real_ts64(&ts64);
return timestamp_truncate(ts64, inode);
}
or whatever. Make it *very* clear that the finegrained timestamp is
the absolute last option, after we have checked that the regular one
isn't possible.
I dunno.
Linus
On Wed, 2023-10-18 at 12:18 -0700, Linus Torvalds wrote:
> On Wed, 18 Oct 2023 at 10:41, Jeff Layton <jlayton@kernel.org> wrote:
> >
> > One way to prevent this is to ensure that when we stamp a file with a
> > fine-grained timestamp, that we use that value to establish a floor for
> > any later timestamp update.
>
> I'm very leery of this.
>
> I don't like how it's using a global time - and a global fine-grained
> offset - when different filesystems will very naturally have different
> granularities. I also don't like how it's now using that global lock.
>
> Yes, yes, since the use of this all is then gated by the 'is_mgtime()'
> thing, any filesystem with big granularities will presumably never set
> FS_MGTIME in the first place, and that hides the worst pointless cases.
> But it still feels iffy to me.
>
Thanks for taking a look!
I'm not too crazy about the global lock either, but the global fine
grained value ensures that when we have mtime changes that occur across
filesystems that they appear to be in the correct order.
We could (hypothetically) track an offset per superblock or something,
but then you could see out-of-order timestamps in inodes across
different filesystems (even of the same type). I think it'd better not
to do that if we can get away with it.
> Also, the whole current_ctime() logic seems wrong. Later (in 4/9), you do this:
>
> static struct timespec64 current_ctime(struct inode *inode)
> {
> if (is_mgtime(inode))
> return current_mgtime(inode);
>
> and current_mgtime() does
>
> if (nsec & I_CTIME_QUERIED) {
> ktime_get_real_ts64(&now);
> return timestamp_truncate(now, inode);
> }
>
> so once the code has set I_CTIME_QUERIED, it will now use the
> expensive fine-grained time - even when it makes no sense.
>
> As far as I can tell, there is *never* a reason to get the
> fine-grained time if the old inode ctime is already sufficiently far
> away.
>
> IOW, my gut feel is that all this logic should always not only be
> guarded by FS_MGTIME (like you do seem to do), *and* by "has anybody
> even queried this time" - it should *also* always be guarded by "if I
> get the coarse-grained time, is that sufficient?"
>
> So I think the current_ctime() logic should be something along the lines of
>
> static struct timespec64 current_ctime(struct inode *inode)
> {
> struct timespec64 ts64 = current_time(inode);
> unsigned long old_ctime_sec = inode->i_ctime_sec;
> unsigned int old_ctime_nsec = inode->i_ctime_nsec;
>
> if (ts64.tv_sec != old_ctime_sec)
> return ts64;
>
> /*
> * No need to check is_mgtime(inode) - the I_CTIME_QUERIED
> * flag is only set for mgtime filesystems
> */
> if (!(old_ctime_nsec & I_CTIME_QUERIED))
> return ts64;
> old_ctime_nsec &= ~I_CTIME_QUERIED;
> if (ts64.tv_nsec > old_ctime_nsec + inode->i_sb->s_time_gran)
> return ts64;
>
Does that really do what you expect here? current_time will return a
value that has already been through timestamp_truncate. Regardless, I
don't think this function makes as big a difference as you might think.
>
> /* Ok, only *now* do we do a finegrained value */
> ktime_get_real_ts64(&ts64);
> return timestamp_truncate(ts64, inode);
> }
>
> or whatever. Make it *very* clear that the finegrained timestamp is
> the absolute last option, after we have checked that the regular one
> isn't possible.
current_mgtime is calling ktime_get_real_ts64, which is an existing
interface that does not take the global spinlock and won't advance the
global offset. That call should be quite cheap.
The reason we can use that call here is because current_ctime and
current_mgtime are only called from inode_needs_update_time, which is
only called to check whether we need to get write access to the
inode. What we do is look at the current clock and see whether the
timestamps would perceivably change if we were to do the update right
then.
If so, we get write access and then call inode_set_ctime_current(). That
will fetch its own timestamps and reevaluate what sort of update to do.
That's the only place that fetches an expensive fine-grained timestamp
that advances the offset.
So, I think this set already is only getting the expensive fine-grained
timestamps as a last resort.
This is probably an indicator that I need to document this code better
though. It may also be a good idea to reorganize
inode_needs_update_time, current_ctime and current_mgtime for better
clarity.
Many thanks for the comments!
--
Jeff Layton <jlayton@kernel.org>
On Wed, 18 Oct 2023 at 13:47, Jeff Layton <jlayton@kernel.org> wrote:
>
> > old_ctime_nsec &= ~I_CTIME_QUERIED;
> > if (ts64.tv_nsec > old_ctime_nsec + inode->i_sb->s_time_gran)
> > return ts64;
> >
>
> Does that really do what you expect here? current_time will return a
> value that has already been through timestamp_truncate.
Yeah, you're right.
I think you can actually remove the s_time_gran addition. Both the
old_ctime_nsec and the current ts64,tv_nsec are already rounded, so
just doing
if (ts64.tv_nsec > old_ctime_nsec)
return ts64;
would already guarantee that it's different enough.
> current_mgtime is calling ktime_get_real_ts64, which is an existing
> interface that does not take the global spinlock and won't advance the
> global offset. That call should be quite cheap.
Ahh, I did indeed mis-read that as the new one with the lock.
I did react to the fact that is_mgtime(inode) itself is horribly
expensive if it's not cached (following three quite possibly cold
pointers), which was part of that whole "look at I_CTIME_QUERIED
instead".
I see the pointer chasing as a huge VFS timesink in all my profiles,
although usually it's the disgusting security pointer (because selinux
creates separate security nodes for each inode, even when the contents
are often identical). So I'm a bit sensitive to "follow several
pointers from 'struct inode'" patterns from looking at too many
instruction profiles.
Linus
On Wed, 2023-10-18 at 14:31 -0700, Linus Torvalds wrote: > On Wed, 18 Oct 2023 at 13:47, Jeff Layton <jlayton@kernel.org> wrote: > > > > > old_ctime_nsec &= ~I_CTIME_QUERIED; > > > if (ts64.tv_nsec > old_ctime_nsec + inode->i_sb->s_time_gran) > > > return ts64; > > > > > > > Does that really do what you expect here? current_time will return a > > value that has already been through timestamp_truncate. > > Yeah, you're right. > > I think you can actually remove the s_time_gran addition. Both the > old_ctime_nsec and the current ts64,tv_nsec are already rounded, so > just doing > > if (ts64.tv_nsec > old_ctime_nsec) > return ts64; > > would already guarantee that it's different enough. > Yep, and that's basically what inode_set_ctime_current does (though it does a timespec64 comparison). > > current_mgtime is calling ktime_get_real_ts64, which is an existing > > interface that does not take the global spinlock and won't advance the > > global offset. That call should be quite cheap. > > Ahh, I did indeed mis-read that as the new one with the lock. > > I did react to the fact that is_mgtime(inode) itself is horribly > expensive if it's not cached (following three quite possibly cold > pointers), which was part of that whole "look at I_CTIME_QUERIED > instead". > > I see the pointer chasing as a huge VFS timesink in all my profiles, > although usually it's the disgusting security pointer (because selinux > creates separate security nodes for each inode, even when the contents > are often identical). So I'm a bit sensitive to "follow several > pointers from 'struct inode'" patterns from looking at too many > instruction profiles. That's a very good point. I'll see if I can get rid of that (and maybe some other mgtime flag checks) before I send the next version. Back to your earlier point though: Is a global offset really a non-starter? I can see about doing something per-superblock, but ktime_get_mg_coarse_ts64 should be roughly as cheap as ktime_get_coarse_ts64. 
I don't see the downside there for the non- multigrain filesystems to call that. On another note: maybe I need to put this behind a Kconfig option initially too? -- Jeff Layton <jlayton@kernel.org>
> Back to your earlier point though: > > Is a global offset really a non-starter? I can see about doing something > per-superblock, but ktime_get_mg_coarse_ts64 should be roughly as cheap > as ktime_get_coarse_ts64. I don't see the downside there for the non- > multigrain filesystems to call that. I have to say that this doesn't excite me. This whole thing feels a bit hackish. I think that a change version is the way more sane way to go. > > On another note: maybe I need to put this behind a Kconfig option > initially too? So can we for a second consider not introducing fine-grained timestamps at all. We let NFSv3 live with the cache problem it's been living with forever. And for NFSv4 we actually do introduce a proper i_version for all filesystems that matter to it. What filesystems exactly don't expose a proper i_version and what does prevent them from adding one or fixing it?
On Thu, 2023-10-19 at 11:29 +0200, Christian Brauner wrote: > > Back to your earlier point though: > > > > Is a global offset really a non-starter? I can see about doing something > > per-superblock, but ktime_get_mg_coarse_ts64 should be roughly as cheap > > as ktime_get_coarse_ts64. I don't see the downside there for the non- > > multigrain filesystems to call that. > > I have to say that this doesn't excite me. This whole thing feels a bit > hackish. I think that a change version is the way more sane way to go. > What is it about this set that feels so much more hackish to you? Most of this set is pretty similar to what we had to revert. Is it just the timekeeper changes? Why do you feel those are a problem? > > > > On another note: maybe I need to put this behind a Kconfig option > > initially too? > > So can we for a second consider not introducing fine-grained timestamps > at all. We let NFSv3 live with the cache problem it's been living with > forever. > > And for NFSv4 we actually do introduce a proper i_version for all > filesystems that matter to it. > > What filesystems exactly don't expose a proper i_version and what does > prevent them from adding one or fixing it? Certainly we can drop this series altogether if that's the consensus. The main exportable filesystem that doesn't have a suitable change counter now is XFS. Fixing it will require an on-disk format change to accommodate a new version counter that doesn't increment on atime updates. This is something the XFS folks were specifically looking to avoid, but maybe that's the simpler option. There is also bcachefs which I don't think has a change attr yet. They'd also likely need a on-disk format change, but hopefully that's a easier thing to do there since it's a brand new filesystem. There are a smattering of lesser-used local filesystems (f2fs, nilfs2, etc.) that have no i_version support. 
Multigrain timestamps would make it simple to add better change attribute support there, but they can (in principle) all undergo an on-disk format change too if they decide to add one. Then there are filesystems like ntfs that are exportable, but where we can't extend the on-disk format. Those could probably benefit from multigrain timestamps, but those are much lower priority. Not many people sharing their NTFS filesystem via NFS anyway. -- Jeff Layton <jlayton@kernel.org>
On Thu, Oct 19, 2023 at 07:28:48AM -0400, Jeff Layton wrote: > On Thu, 2023-10-19 at 11:29 +0200, Christian Brauner wrote: > > > Back to your earlier point though: > > > > > > Is a global offset really a non-starter? I can see about doing something > > > per-superblock, but ktime_get_mg_coarse_ts64 should be roughly as cheap > > > as ktime_get_coarse_ts64. I don't see the downside there for the non- > > > multigrain filesystems to call that. > > > > I have to say that this doesn't excite me. This whole thing feels a bit > > hackish. I think that a change version is the way more sane way to go. > > > > What is it about this set that feels so much more hackish to you? Most > of this set is pretty similar to what we had to revert. Is it just the > timekeeper changes? Why do you feel those are a problem? So I think that the multi-grain timestamp work was well intended but it was ultimately a mistake. Because we added code that complicated timestamp timestamp handling in the vfs to a point where the costs clearly outweighed the benefits. And I don't think that this direction is worth going into. This whole thread ultimately boils down to complicating generic infrastructure quite extensively for nfs to handle exposing xfs without forcing an on-disk format change. That's even fine. That's not a problem but in the same way I don't think the solution is just stuffing this complexity into the vfs. IOW, if we make this a vfs problem then at the lowest possible cost and not by changing how timestamps work for everyone even if it's just internal.
On Tue, 2023-10-31 at 11:26 +0100, Christian Brauner wrote: > On Thu, Oct 19, 2023 at 07:28:48AM -0400, Jeff Layton wrote: > > On Thu, 2023-10-19 at 11:29 +0200, Christian Brauner wrote: > > > > Back to your earlier point though: > > > > > > > > Is a global offset really a non-starter? I can see about doing something > > > > per-superblock, but ktime_get_mg_coarse_ts64 should be roughly as cheap > > > > as ktime_get_coarse_ts64. I don't see the downside there for the non- > > > > multigrain filesystems to call that. > > > > > > I have to say that this doesn't excite me. This whole thing feels a bit > > > hackish. I think that a change version is the way more sane way to go. > > > > > > > What is it about this set that feels so much more hackish to you? Most > > of this set is pretty similar to what we had to revert. Is it just the > > timekeeper changes? Why do you feel those are a problem? > > So I think that the multi-grain timestamp work was well intended but it > was ultimately a mistake. Because we added code that complicated > timestamp timestamp handling in the vfs to a point where the costs > clearly outweighed the benefits. > > And I don't think that this direction is worth going into. This whole > thread ultimately boils down to complicating generic infrastructure > quite extensively for nfs to handle exposing xfs without forcing an > on-disk format change. That's even fine. > > That's not a problem but in the same way I don't think the solution is > just stuffing this complexity into the vfs. IOW, if we make this a vfs > problem then at the lowest possible cost and not by changing how > timestamps work for everyone even if it's just internal. I'll point out that this last posting I did was an RFC. It was invasive to the timekeeping code, but I don't think that's a hard requirement for doing this. 
I do appreciate the feedback on this version of the series (particularly from Thomas who gave a great technical reason why this approach was wrong), but I don't think we necessarily have to give up on the whole idea because this particular implementation was too costly. The core idea for fixing the problem with the original series is sane, IMO. There is nothing wrong with simply making it that when we stamp a file with a fine-grained timestamp that we consider that a floor for all later timestamp updates. The only real question is how to keep that (global) fine-grained floor offset at a low cost. I think that's a solvable problem. I also believe that real, measurable fine-grained timestamp differences are worthwhile for other use cases beyond NFS. Everyone was pointing out the problems with lagging timestamps vs. make and rsync, but that's a double-edged sword. With the current always coarse-grained timestamps, the ordering of files written within the same jiffy can't be determined since their timestamps will be identical. We could conceivably change that with this series. That said, if this has no chance of ever being merged, then I won't bother working on it further, and we can try to pursue something that is (maybe) XFS-specific. Let me know, either way. -- Jeff Layton <jlayton@kernel.org>
On Thu, Oct 19, 2023 at 07:28:48AM -0400, Jeff Layton wrote: > On Thu, 2023-10-19 at 11:29 +0200, Christian Brauner wrote: > > > Back to your earlier point though: > > > > > > Is a global offset really a non-starter? I can see about doing something > > > per-superblock, but ktime_get_mg_coarse_ts64 should be roughly as cheap > > > as ktime_get_coarse_ts64. I don't see the downside there for the non- > > > multigrain filesystems to call that. > > > > I have to say that this doesn't excite me. This whole thing feels a bit > > hackish. I think that a change version is the way more sane way to go. > > > > What is it about this set that feels so much more hackish to you? Most > of this set is pretty similar to what we had to revert. Is it just the > timekeeper changes? Why do you feel those are a problem? > > > > > > > On another note: maybe I need to put this behind a Kconfig option > > > initially too? > > > > So can we for a second consider not introducing fine-grained timestamps > > at all. We let NFSv3 live with the cache problem it's been living with > > forever. > > > > And for NFSv4 we actually do introduce a proper i_version for all > > filesystems that matter to it. > > > > What filesystems exactly don't expose a proper i_version and what does > > prevent them from adding one or fixing it? > > Certainly we can drop this series altogether if that's the consensus. > > The main exportable filesystem that doesn't have a suitable change > counter now is XFS. Fixing it will require an on-disk format change to > accommodate a new version counter that doesn't increment on atime > updates. This is something the XFS folks were specifically looking to > avoid, but maybe that's the simpler option. And now we have travelled the full circle. The problem NFS has with atime updates on XFS is a result of the default behaviour of relatime - it *always* forces a persistent atime update after mtime has changed. 
Hence a read-after-write operation will trigger an atime update because atime is older than mtime. This is what causes XFS to run a transaction (i.e. a persistent atime update) and that bumps iversion. lazytime does not behave this way - it delays all persistent timestamp updates until the next persistent change or until the lazytime aggregation period expires (24 hours). Hence with lazytime, read-after-write operations do not trigger a persistent atime update, and so XFS does not run a transaction to update atime. Hence i_version does not get bumped, and NFS behaves as expected. IOWs, what the NFS server actually wants from the filesystems is for lazy timestamp updates to always be used on read operations. It does not want persistent timestamp updates that change on-disk state. The recent "redefinition" of when i_version should change effectively encodes this - i_version should only change when a persistent metadata or data change is made that also changes [cm]time. Hence the simple, in-memory solution to this problem is for NFS to tell the filesystems that it needs to use lazy (in-memory) atime updates for the given operation rather than persistent atime updates. We already need to modify how atime updates work for io_uring - io_uring needs atime updates to be guaranteed non-blocking similar to updating mtime in the write IO path. If a persistent timestamp change needs to be run, then the timestamp update needs to return -EAGAIN rather than (potentially) blocking so the entire operation can be punted to a context that can block. This requires control flags to be passed to the core atime handling functions. If a filesystem doesn't understand/support the flags, it can just ignore it and do the update however it was going to do it. It won't make anything work incorrectly, just might do something that is not ideal.
With this new "non-blocking update only" flag for io_uring and a new "non-persistent update only" flag for NFS, we have very similar conditional atime update requirements from two completely independent in-kernel applications. IOWs, this can be solved quite simply by having the -application- define the persistence semantics of the operation being performed. Add a RWF_LAZYTIME/IOCB_LAZYTIME flag for read IO that is being issued from the nfs daemon (i.e. passed to vfs_iter_read()) and then the vfs/filesystem can do exactly the right thing for the IO being issued. This is what io_uring does with IOCB_NOWAIT to tell the filesystems that the IO must be non-blocking, and it's the key we already use for non-blocking mtime updates and will use to trigger non-blocking atime updates.... I also know of cases where a per-IO RWF_LAZYTIME flag would be beneficial - large databases are already using lazytime mount options so that their data IO doesn't take persistent mtime update overhead hits on every write IO..... > There is also bcachefs which I don't think has a change attr yet. They'd > also likely need a on-disk format change, but hopefully that's a easier > thing to do there since it's a brand new filesystem. It's not a "brand new filesystem". It's been out there for quite a long while, and it has many users that would be impacted by on-disk format changes at this point in its life. on-disk format changes are a fairly major deal for filesystems, and if there is any way we can avoid them we should. -Dave. -- Dave Chinner david@fromorbit.com
On Fri, 2023-10-20 at 09:02 +1100, Dave Chinner wrote: > On Thu, Oct 19, 2023 at 07:28:48AM -0400, Jeff Layton wrote: > > On Thu, 2023-10-19 at 11:29 +0200, Christian Brauner wrote: > > > > Back to your earlier point though: > > > > > > > > Is a global offset really a non-starter? I can see about doing something > > > > per-superblock, but ktime_get_mg_coarse_ts64 should be roughly as cheap > > > > as ktime_get_coarse_ts64. I don't see the downside there for the non- > > > > multigrain filesystems to call that. > > > > > > I have to say that this doesn't excite me. This whole thing feels a bit > > > hackish. I think that a change version is the way more sane way to go. > > > > > > > What is it about this set that feels so much more hackish to you? Most > > of this set is pretty similar to what we had to revert. Is it just the > > timekeeper changes? Why do you feel those are a problem? > > > > > > > > > > On another note: maybe I need to put this behind a Kconfig option > > > > initially too? > > > > > > So can we for a second consider not introducing fine-grained timestamps > > > at all. We let NFSv3 live with the cache problem it's been living with > > > forever. > > > > > > And for NFSv4 we actually do introduce a proper i_version for all > > > filesystems that matter to it. > > > > > > What filesystems exactly don't expose a proper i_version and what does > > > prevent them from adding one or fixing it? > > > > Certainly we can drop this series altogether if that's the consensus. > > > > The main exportable filesystem that doesn't have a suitable change > > counter now is XFS. Fixing it will require an on-disk format change to > > accommodate a new version counter that doesn't increment on atime > > updates. This is something the XFS folks were specifically looking to > > avoid, but maybe that's the simpler option. > > And now we have travelled the full circle. > LOL, yes! 
> The problem NFS has with atime updates on XFS is a result of > the default behaviour of relatime - it *always* forces a persistent > atime update after mtime has changed. Hence a read-after-write > operation will trigger an atime update because atime is older than > mtime. This is what causes XFS to run a transaction (i.e. a > persistent atime update) and that bumps iversion. > Those particular atime updates are not a problem. If we're updating the mtime and ctime anyway, then bumping the change attribute is OK. The problem is that relatime _also_ does an on-disk update once a day for just an atime update. On XFS, this means that the change attribute also gets bumped and the clients invalidate their caches all at once. That doesn't sound like a big problem at first, but what if you're sharing a multi-gigabyte r/o file between multiple clients? This sort of thing is fairly common on render-farm workloads, and all of your clients will end up invalidating their caches once a day if you're serving from XFS. > lazytime does not behave this way - it delays all persistent > timestamp updates until the next persistent change or until the > lazytime aggregation period expires (24 hours). Hence with lazytime, > read-after-write operations do not trigger a persistent atime > update, and so XFS does not run a transaction to update atime. Hence > i_version does not get bumped, and NFS behaves as expected. > Similar problem here. Once a day, NFS clients will invalidate the cache on any static content served from XFS. > IOWs, what the NFS server actually wants from the filesytsems is for > lazy timestamp updates to always be used on read operations. It does > not want persistent timestamp updates that change on-disk state. The > recent "redefinition" of when i_version should change effectively > encodes this - i_version should only change when a persistent > metadata or data change is made that also changes [cm]time.
> > Hence the simple, in-memory solution to this problem is for NFS to > tell the filesysetms that it needs to using lazy (in-memory) atime > updates for the given operation rather than persistent atime updates. > > We already need to modify how atime updates work for io_uring - > io_uring needs atime updates to be guaranteed non-blocking similar > to updating mtime in the write IO path. If a persistent timestamp > change needs to be run, then the timestamp update needs to return > -EAGAIN rather than (potentially) blocking so the entire operation > can be punted to a context that can block. > > This requires control flags to be passed to the core atime handling > functions. If a filesystem doesn't understand/support the flags, it > can just ignore it and do the update however it was going to do it. > It won't make anything work incorrectly, just might do something > that is not ideal. > > With this new "non-blocking update only" flag for io_uring and a > new "non-persistent update only" flag for NFS, we have a very > similar conditional atime update requirements from two completely > independent in-kernel applications. > > IOWs, this can be solved quite simply by having the -application- > define the persistence semantics of the operation being performed. > Add a RWF_LAZYTIME/IOCB_LAZYTIME flag for read IO that is being > issued from the nfs daemon (i.e. passed to vfs_iter_read()) and then > the vfs/filesystem can do exactly the right thing for the IO being > issued. > > This is what io_uring does with IOCB_NOWAIT to tell the filesystems > that the IO must be non-blocking, and it's the key we already use > for non-blocking mtime updates and will use to trigger non-blocking > atime updates.... > > I also know of cases where a per-IO RWF_LAZYTIME flag would be > beneficial - large databases are already using lazytime mount > options so that their data IO doesn't take persistent mtime update > overhead hits on every write IO..... 
> I don't think that trying to do something "special" for activity that is initiated by the NFS server solves anything. Bear in mind that NFS clients care about locally-initiated activity too. The bottom line is that we don't want to be foisting a cache invalidation on the clients just because someone read a file, or did some similar activity like a readdir or readlink. The lazytime/relatime options may mitigate the problem, but they're not a real solution. What NFS _really_ wants is a proper change counter that doesn't increment on read(like) operations. In practice, that comes down to just not incrementing it on atime updates. btrfs, ext4 and tmpfs have this (now). xfs does not because its change attribute is incremented when an atime update is logged, and that is evidently something that cannot be changed without an on-disk format change. > > There is also bcachefs which I don't think has a change attr yet. They'd > > also likely need a on-disk format change, but hopefully that's a easier > > thing to do there since it's a brand new filesystem. > > It's not a "brand new filesystem". It's been out there for quite a > long while, and it has many users that would be impacted by on-disk > format changes at this point in it's life. on-disk format changes > are a fairly major deal for filesystems, and if there is any way we > can avoid them we should. Sure. It's new to me though. It's also not yet merged into mainline. I'd _really_ like to see a proper change counter added before it's merged, or at least space in the on-disk inode reserved for one until we can get it plumbed in. That would suck for the current users, I suppose, but at least that userbase is small now. Once it's merged, there will be a lot more people using it and it becomes just that much more difficult. I suppose bcachefs could try to hold out for the multigrain timestamp work too, but that may not ever make it in. -- Jeff Layton <jlayton@kernel.org>
On Fri, Oct 20, 2023 at 08:12:45AM -0400, Jeff Layton wrote: > On Fri, 2023-10-20 at 09:02 +1100, Dave Chinner wrote: > > On Thu, Oct 19, 2023 at 07:28:48AM -0400, Jeff Layton wrote: > > > On Thu, 2023-10-19 at 11:29 +0200, Christian Brauner wrote: > > > > > Back to your earlier point though: > > > > > > > > > > Is a global offset really a non-starter? I can see about doing something > > > > > per-superblock, but ktime_get_mg_coarse_ts64 should be roughly as cheap > > > > > as ktime_get_coarse_ts64. I don't see the downside there for the non- > > > > > multigrain filesystems to call that. > > > > > > > > I have to say that this doesn't excite me. This whole thing feels a bit > > > > hackish. I think that a change version is the way more sane way to go. > > > > > > > > > > What is it about this set that feels so much more hackish to you? Most > > > of this set is pretty similar to what we had to revert. Is it just the > > > timekeeper changes? Why do you feel those are a problem? > > > > > > > > > > > > > On another note: maybe I need to put this behind a Kconfig option > > > > > initially too? > > > > > > > > So can we for a second consider not introducing fine-grained timestamps > > > > at all. We let NFSv3 live with the cache problem it's been living with > > > > forever. > > > > > > > > And for NFSv4 we actually do introduce a proper i_version for all > > > > filesystems that matter to it. > > > > > > > > What filesystems exactly don't expose a proper i_version and what does > > > > prevent them from adding one or fixing it? > > > > > > Certainly we can drop this series altogether if that's the consensus. > > > > > > The main exportable filesystem that doesn't have a suitable change > > > counter now is XFS. Fixing it will require an on-disk format change to > > > accommodate a new version counter that doesn't increment on atime > > > updates. This is something the XFS folks were specifically looking to > > > avoid, but maybe that's the simpler option. 
> > > > And now we have travelled the full circle. > > > > LOL, yes! > > > The problem NFS has with atime updates on XFS is a result of > > the default behaviour of relatime - it *always* forces a persistent > > atime update after mtime has changed. Hence a read-after-write > > operation will trigger an atime update because atime is older than > > mtime. This is what causes XFS to run a transaction (i.e. a > > persistent atime update) and that bumps iversion. > > > > Those particular atime updates are not a problem. If we're updating the > mtime and ctime anyway, then bumping the change attribute is OK. > > The problem is that relatime _also_ does an on-disk update once a day > for just an atime update. On XFS, this means that the change attribute > also gets bumped and the clients invalidate their caches all at once. > > That doesn't sound like a big problem at first, but what if you're > sharing a multi-gigabyte r/o file between multiple clients? This sort of > thing is fairly common on render-farm workloads, and all of your clients > will end up invalidating their caches once once a day if you're serving > from XFS. So we have noatime inode and mount options for such specialised workloads that cannot tolerate cached ever being invalidated, yes? > > lazytime does not behave this way - it delays all persistent > > timestamp updates until the next persistent change or until the > > lazytime aggregation period expires (24 hours). Hence with lazytime, > > read-after-write operations do not trigger a persistent atime > > update, and so XFS does not run a transaction to update atime. Hence > > i_version does not get bumped, and NFS behaves as expected. > > > > Similar problem here. Once a day, NFS clients will invalidate the cache > on any static content served from XFS. Lazytime has /proc/sys/vm/dirtytime_expire_seconds to change the interval that triggers persistent time changes. 
That could easily be configured to be longer than a day for workloads that care about this sort of thing. Indeed, we could just set up timestamps that NFS says "do not make persistent" to only be persisted when the inode is removed from server memory rather than be timed out by background writeback.... ----- All I'm suggesting is that rather than using mount options for noatime-like behaviour for NFSD accesses, we actually have the nfsd accesses say "we'd like pure atime updates without iversion, please". Keep in mind that XFS does actually try to avoid bumping i_version on pure timestamp updates - we carved that out a long time ago (see the difference in XFS_ILOG_CORE vs XFS_ILOG_TIMESTAMP in xfs_vn_update_time() and xfs_trans_log_inode()) so that we could optimise fdatasync() to ignore timestamp updates that occur as a result of pure data overwrites. Hence XFS only bumps i_version for pure timestamp updates if the iversion queried flag is set. IOWs, XFS it is actually doing exactly what the VFS iversion implementation is telling it to do with timestamp updates for non-core inode metadata updates. That's the fundamental issue here: nfsd has set VFS state that tells the filesystem to "bump iversion on next persistent inode change", but the nfsd then runs operations that can change non-critical persistent inode state in "query-only" operations. It then expects filesystems to know that it should ignore the iversion queried state within this context. However, without external behavioural control flags, filesystems cannot know that an isolated metadata update has context specific iversion behavioural constraints. Hence fixing this is purely a VFS/nfsd i_version implementation problem - if the nfsd is running a querying operation, it should tell the filesystem that it should ignore iversion query state. 
If nothing the application level cache cares about is being changed during the query operation, it should tell the filesystem to ignore iversion query state because it is likely the nfsd query itself will set it (or have already set it itself in the case of compound operations). This does not need XFS on-disk format changes to fix. This does not need changes to timestamp infrastructure to fix. We just need the nfsd application to tell us that we should ignore the vfs i_version query state when we update non-core inode metadata within query operation contexts. -Dave. -- Dave Chinner david@fromorbit.com
On Mon, 2023-10-23 at 09:17 +1100, Dave Chinner wrote: > On Fri, Oct 20, 2023 at 08:12:45AM -0400, Jeff Layton wrote: > > On Fri, 2023-10-20 at 09:02 +1100, Dave Chinner wrote: > > > On Thu, Oct 19, 2023 at 07:28:48AM -0400, Jeff Layton wrote: > > > > On Thu, 2023-10-19 at 11:29 +0200, Christian Brauner wrote: > > > > > > Back to your earlier point though: > > > > > > > > > > > > Is a global offset really a non-starter? I can see about doing something > > > > > > per-superblock, but ktime_get_mg_coarse_ts64 should be roughly as cheap > > > > > > as ktime_get_coarse_ts64. I don't see the downside there for the non- > > > > > > multigrain filesystems to call that. > > > > > > > > > > I have to say that this doesn't excite me. This whole thing feels a bit > > > > > hackish. I think that a change version is the way more sane way to go. > > > > > > > > > > > > > What is it about this set that feels so much more hackish to you? Most > > > > of this set is pretty similar to what we had to revert. Is it just the > > > > timekeeper changes? Why do you feel those are a problem? > > > > > > > > > > > > > > > > On another note: maybe I need to put this behind a Kconfig option > > > > > > initially too? > > > > > > > > > > So can we for a second consider not introducing fine-grained timestamps > > > > > at all. We let NFSv3 live with the cache problem it's been living with > > > > > forever. > > > > > > > > > > And for NFSv4 we actually do introduce a proper i_version for all > > > > > filesystems that matter to it. > > > > > > > > > > What filesystems exactly don't expose a proper i_version and what does > > > > > prevent them from adding one or fixing it? > > > > > > > > Certainly we can drop this series altogether if that's the consensus. > > > > > > > > The main exportable filesystem that doesn't have a suitable change > > > > counter now is XFS. 
Fixing it will require an on-disk format change to > > > > accommodate a new version counter that doesn't increment on atime > > > > updates. This is something the XFS folks were specifically looking to > > > > avoid, but maybe that's the simpler option. > > > > > > And now we have travelled the full circle. > > > > > > > LOL, yes! > > > > > The problem NFS has with atime updates on XFS is a result of > > > the default behaviour of relatime - it *always* forces a persistent > > > atime update after mtime has changed. Hence a read-after-write > > > operation will trigger an atime update because atime is older than > > > mtime. This is what causes XFS to run a transaction (i.e. a > > > persistent atime update) and that bumps iversion. > > > > > > > Those particular atime updates are not a problem. If we're updating the > > mtime and ctime anyway, then bumping the change attribute is OK. > > > > The problem is that relatime _also_ does an on-disk update once a day > > for just an atime update. On XFS, this means that the change attribute > > also gets bumped and the clients invalidate their caches all at once. > > > > That doesn't sound like a big problem at first, but what if you're > > sharing a multi-gigabyte r/o file between multiple clients? This sort of > > thing is fairly common on render-farm workloads, and all of your clients > > will end up invalidating their caches once once a day if you're serving > > from XFS. > > So we have noatime inode and mount options for such specialised > workloads that cannot tolerate cached ever being invalidated, yes? > > > > lazytime does not behave this way - it delays all persistent > > > timestamp updates until the next persistent change or until the > > > lazytime aggregation period expires (24 hours). Hence with lazytime, > > > read-after-write operations do not trigger a persistent atime > > > update, and so XFS does not run a transaction to update atime. 
Hence > > > i_version does not get bumped, and NFS behaves as expected. > > > > > > > Similar problem here. Once a day, NFS clients will invalidate the cache > > on any static content served from XFS. > > Lazytime has /proc/sys/vm/dirtytime_expire_seconds to change the > interval that triggers persistent time changes. That could easily be > configured to be longer than a day for workloads that care about > this sort of thing. Indeed, we could just set up timestamps that NFS > says "do not make persistent" to only be persisted when the inode is > removed from server memory rather than be timed out by background > writeback.... > > ----- > > All I'm suggesting is that rather than using mount options for > noatime-like behaviour for NFSD accesses, we actually have the nfsd > accesses say "we'd like pure atime updates without iversion, please". > > Keep in mind that XFS does actually try to avoid bumping i_version > on pure timestamp updates - we carved that out a long time ago (see > the difference in XFS_ILOG_CORE vs XFS_ILOG_TIMESTAMP in > xfs_vn_update_time() and xfs_trans_log_inode()) so that we could > optimise fdatasync() to ignore timestamp updates that occur as a > result of pure data overwrites. > > Hence XFS only bumps i_version for pure timestamp updates if the > iversion queried flag is set. IOWs, XFS it is actually doing exactly > what the VFS iversion implementation is telling it to do with > timestamp updates for non-core inode metadata updates. > > That's the fundamental issue here: nfsd has set VFS state that tells > the filesystem to "bump iversion on next persistent inode change", > but the nfsd then runs operations that can change non-critical > persistent inode state in "query-only" operations. It then expects > filesystems to know that it should ignore the iversion queried state > within this context. 
However, without external behavioural control > flags, filesystems cannot know that an isolated metadata update has > context specific iversion behavioural constraints. > Hence fixing this is purely a VFS/nfsd i_version implementation > problem - if the nfsd is running a querying operation, it should > tell the filesystem that it should ignore iversion query state. If > nothing the application level cache cares about is being changed > during the query operation, it should tell the filesystem to ignore > iversion query state because it is likely the nfsd query itself will > set it (or have already set it itself in the case of compound > operations). > > This does not need XFS on-disk format changes to fix. This does not > need changes to timestamp infrastructure to fix. We just need the > nfsd application to tell us that we should ignore the vfs i_version > query state when we update non-core inode metadata within query > operation contexts. > I think you're missing the point of the problem I'm trying to solve. I'm not necessarily trying to guard nfsd against its own accesses. The reads that trigger an eventual atime update could come from anywhere -- nfsd, userland accesses, etc. If you are serving an XFS filesystem, with the (default) relatime mount option, then you are guaranteed that the clients will invalidate their cache of a file once per day, assuming that at least one read was issued against the file during that day. That read will cause an eventual atime bump to be logged, at which point the change attribute will change. The client will then assume that it needs to invalidate its cache when it sees that change. Changing how nfsd does its own accesses won't fix anything, because the problematic atime bump can come from any sort of read access. -- Jeff Layton <jlayton@kernel.org>
On Mon, Oct 23, 2023 at 10:45:21AM -0400, Jeff Layton wrote: > On Mon, 2023-10-23 at 09:17 +1100, Dave Chinner wrote: > > All I'm suggesting is that rather than using mount options for > > noatime-like behaviour for NFSD accesses, we actually have the nfsd > > accesses say "we'd like pure atime updates without iversion, please". > > > > Keep in mind that XFS does actually try to avoid bumping i_version > > on pure timestamp updates - we carved that out a long time ago (see > > the difference in XFS_ILOG_CORE vs XFS_ILOG_TIMESTAMP in > > xfs_vn_update_time() and xfs_trans_log_inode()) so that we could > > optimise fdatasync() to ignore timestamp updates that occur as a > > result of pure data overwrites. > > > > Hence XFS only bumps i_version for pure timestamp updates if the > > iversion queried flag is set. IOWs, XFS it is actually doing exactly > > what the VFS iversion implementation is telling it to do with > > timestamp updates for non-core inode metadata updates. > > > > That's the fundamental issue here: nfsd has set VFS state that tells > > the filesystem to "bump iversion on next persistent inode change", > > but the nfsd then runs operations that can change non-critical > > persistent inode state in "query-only" operations. It then expects > > filesystems to know that it should ignore the iversion queried state > > within this context. However, without external behavioural control > > flags, filesystems cannot know that an isolated metadata update has > > context specific iversion behavioural constraints. > > > Hence fixing this is purely a VFS/nfsd i_version implementation > > problem - if the nfsd is running a querying operation, it should > > tell the filesystem that it should ignore iversion query state. 
If > > nothing the application level cache cares about is being changed > > during the query operation, it should tell the filesystem to ignore > > iversion query state because it is likely the nfsd query itself will > > set it (or have already set it itself in the case of compound > > operations). > > > > This does not need XFS on-disk format changes to fix. This does not > > need changes to timestamp infrastructure to fix. We just need the > > nfsd application to tell us that we should ignore the vfs i_version > > query state when we update non-core inode metadata within query > > operation contexts. > > > > I think you're missing the point of the problem I'm trying to solve. > I'm not necessarily trying to guard nfsd against its own accesses. The > reads that trigger an eventual atime update could come from anywhere -- > nfsd, userland accesses, etc. > > If you are serving an XFS filesystem, with the (default) relatime mount > option, then you are guaranteed that the clients will invalidate their > cache of a file once per day, assuming that at least one read was issued > against the file during that day. > > That read will cause an eventual atime bump to be logged, at which point > the change attribute will change. The client will then assume that it > needs to invalidate its cache when it sees that change. > > Changing how nfsd does its own accesses won't fix anything, because the > problematic atime bump can come from any sort of read access. I'm not missing the point at all - as I've said in the past I don't think local vs remote access is in any way relevant to the original problem that needs to be solved. If the local access is within the relatime window, it won't cause any persistent metadata change at all. If it's outside the window, then it's no different to the NFS client reading data from the server outside the window. 
If it's the first access after a NFS client side modification, then it's just really bad timing but it isn't likely to be a common issue. Hence I just don't think it matters one bit, and we can address the 24 hour problem separately to the original problem that still needs to be fixed. The problem is the first read request after a modification has been made. That is causing relatime to see mtime > atime and triggering an atime update. XFS sees this, does an atime update, and in committing that persistent inode metadata update, it calls inode_maybe_inc_iversion(force = false) to check if an iversion update is necessary. The VFS sees I_VERSION_QUERIED, and so it bumps i_version and tells XFS to persist it. IOWs, XFS is doing exactly what the VFS is telling it to do with i_version during the persistent inode metadata update that the VFS told it to make. This, however, is not the semantics that the *nfsd application* wants. It does not want i_version to be updated when it is running a data read operation despite the fact the VFS is telling the filesystem it needs to be updated. What we need to know is when the inode is being accessed by the nfsd so we can change the in-memory timestamp update behaviour appropriately. We really don't need on-disk format changes - we just need to know that we're supposed to do something special with pure timestamp updates because i_version needs to behave in a manner compatible with the new NFS requirements.... We also don't need generic timestamp infrastructure changes to do this - the multi-grained timestamp was a neat idea for generic filesystem support of the nfsd i_version requirements, but it's collapsed under the weight of complexity. There are simpler ways individual filesystems can do the right thing, but to do that we need to know that nfsd has actively referenced the inode. How we get that information is what I want to resolve, the filesystem should be able to handle everything else in memory....
Perhaps we can extract I_VERSION_QUERIED as a proxy for nfsd activity on the inode rather than need a per-operation context? Is that going to be reliable enough? Will that cause problems for other applications that want to use i_version for their own purposes? Cheers, Dave. -- Dave Chinner david@fromorbit.com
On Mon, 23 Oct 2023 at 13:26, Dave Chinner <david@fromorbit.com> wrote:
>
> The problem is the first read request after a modification has been
> made. That is causing relatime to see mtime > atime and triggering
> an atime update. XFS sees this, does an atime update, and in
> committing that persistent inode metadata update, it calls
> inode_maybe_inc_iversion(force = false) to check if an iversion
> update is necessary. The VFS sees I_VERSION_QUERIED, and so it bumps
> i_version and tells XFS to persist it.
Could we perhaps just have a mode where we don't increment i_version
for just atime updates?
Maybe we don't even need a mode, and could just decide that atime
updates aren't i_version updates at all?
Yes, yes, it's obviously technically a "inode modification", but does
anybody actually *want* atime updates with no actual other changes to
be version events?
Or maybe i_version can update, but callers of getattr() could have two
bits for that STATX_CHANGE_COOKIE, one for "I care about atime" and
one for others, and we'd pass that down to inode_query_version, and
we'd have a I_VERSION_QUERIED and a I_VERSION_QUERIED_STRICT, and the
"I care about atime" case would set the strict one.
Then inode_maybe_inc_iversion() could - for atime updates - skip the
version update *unless* it sees that I_VERSION_QUERIED_STRICT bit.
Does that sound sane to people?
Because it does sound completely insane to me to say "inode changed"
and have a cache invalidation just for an atime update.
Linus
On Mon, 2023-10-23 at 14:18 -1000, Linus Torvalds wrote: > On Mon, 23 Oct 2023 at 13:26, Dave Chinner <david@fromorbit.com> wrote: > > > > The problem is the first read request after a modification has been > > made. That is causing relatime to see mtime > atime and triggering > > an atime update. XFS sees this, does an atime update, and in > > committing that persistent inode metadata update, it calls > > inode_maybe_inc_iversion(force = false) to check if an iversion > > update is necessary. The VFS sees I_VERSION_QUERIED, and so it bumps > > i_version and tells XFS to persist it. > > Could we perhaps just have a mode where we don't increment i_version > for just atime updates? > > Maybe we don't even need a mode, and could just decide that atime > updates aren't i_version updates at all? > > Yes, yes, it's obviously technically a "inode modification", but does > anybody actually *want* atime updates with no actual other changes to > be version events? > > Or maybe i_version can update, but callers of getattr() could have two > bits for that STATX_CHANGE_COOKIE, one for "I care about atime" and > one for others, and we'd pass that down to inode_query_version, and > we'd have a I_VERSION_QUERIED and a I_VERSION_QUERIED_STRICT, and the > "I care about atime" case ould set the strict one. > > Then inode_maybe_inc_iversion() could - for atome updates - skip the > version update *unless* it sees that I_VERSION_QUERIED_STRICT bit. > > Does that sound sane to people? > > Because it does sound completely insane to me to say "inode changed" > and have a cache invalidation just for an atime update. > The new flag idea is a good one. The catch though is that there are no readers of i_version in-kernel other than NFSD and IMA, so there would be no in-kernel users of I_VERSION_QUERIED_STRICT. In earlier discussions, I was given to believe that the problem with changing how this works in XFS involved offline filesystem access tools. 
That said, I didn't press for enough details at the time, so I may have misunderstood Dave's reticence to change how this works. -- Jeff Layton <jlayton@kernel.org>
On Tue, 24 Oct 2023 at 09:07, Jeff Layton <jlayton@kernel.org> wrote:
>
> The new flag idea is a good one. The catch though is that there are no
> readers of i_version in-kernel other than NFSD and IMA, so there would
> be no in-kernel users of I_VERSION_QUERIED_STRICT.
I actually see that as an absolute positive.
I think we should *conceptually* do those two flags, but then realize
that there are no users of the STRICT version, and just skip it.
So practically speaking, we'd end up with just a weaker version of
I_VERSION_QUERIED that is that "I don't care about atime" case.
I really can't find any use that would *want* to see i_version updates
for any atime updates. Ever.
We may have had historical user interfaces for i_version, but I can't
find any currently.
But to be very very clear: I've only done some random grepping, and I
may have missed something. I'm not dismissing Dave's worries, and he
may well be entirely correct.
Somebody would need to do a much more careful check than my "I can't
find anything".
Linus
On Tue, 2023-10-24 at 09:40 -1000, Linus Torvalds wrote: > On Tue, 24 Oct 2023 at 09:07, Jeff Layton <jlayton@kernel.org> wrote: > > > > The new flag idea is a good one. The catch though is that there are no > > readers of i_version in-kernel other than NFSD and IMA, so there would > > be no in-kernel users of I_VERSION_QUERIED_STRICT. > > I actually see that as an absolute positive. > > I think we should *conceptually* do those two flags, but then realize > that there are no users of the STRICT version, and just skip it. > > So practically speaking, we'd end up with just a weaker version of > I_VERSION_QUERIED that is that "I don't care about atime" case. > To be clear, this is not kernel-wide behavior. Most filesystems already don't bump their i_version on atime updates. XFS is the only one that does. ext4 used to do that too, but we fixed that several months ago. I did try to just fix XFS in the same way, but the patch was NAK'ed. > I really can't find any use that would *want* to see i_version updates > for any atime updates. Ever. > > We may have had historical user interfaces for i_version, but I can't > find any currently. > > But to be very very clear: I've only done some random grepping, and I > may have missed something. I'm not dismissing Dave's worries, and he > may well be entirely correct. > > Somebody would need to do a much more careful check than my "I can't > find anything". Exactly. I'm not really an XFS guy, so I took those folks at their word that this was behavior that they just can't trivially change. None of the in-kernel callers that look at i_version want it to be incremented on atime-onlt updates, however. So IIRC, the objection was due to offline repair/analysis tools that depend this the value being incremented in a specific way. -- Jeff Layton <jlayton@kernel.org>
On Mon, Oct 23, 2023 at 02:18:12PM -1000, Linus Torvalds wrote: > On Mon, 23 Oct 2023 at 13:26, Dave Chinner <david@fromorbit.com> wrote: > > > > The problem is the first read request after a modification has been > > made. That is causing relatime to see mtime > atime and triggering > > an atime update. XFS sees this, does an atime update, and in > > committing that persistent inode metadata update, it calls > > inode_maybe_inc_iversion(force = false) to check if an iversion > > update is necessary. The VFS sees I_VERSION_QUERIED, and so it bumps > > i_version and tells XFS to persist it. > > Could we perhaps just have a mode where we don't increment i_version > for just atime updates? > > Maybe we don't even need a mode, and could just decide that atime > updates aren't i_version updates at all? We do that already - in memory atime updates don't bump i_version at all. The issue is the rare persistent atime update requests that still happen - they are the ones that trigger an i_version bump on XFS, and one of the relatime heuristics tickle this specific issue. If we push the problematic persistent atime updates to be in-memory updates only, then the whole problem with i_version goes away.... > Yes, yes, it's obviously technically a "inode modification", but does > anybody actually *want* atime updates with no actual other changes to > be version events? Well, yes, there was. That's why we defined i_version in the on disk format this way well over a decade ago. It was part of some deep dark magical HSM beans that allowed the application to combine multiple scans for different inode metadata changes into a single pass. atime changes was one of the things it needed to know about for tiering and space scavenging purposes.... 
> Or maybe i_version can update, but callers of getattr() could have two > bits for that STATX_CHANGE_COOKIE, one for "I care about atime" and > one for others, and we'd pass that down to inode_query_version, and > we'd have a I_VERSION_QUERIED and a I_VERSION_QUERIED_STRICT, and the > "I care about atime" case ould set the strict one. This makes correct behaviour reliant on the applicaiton using the query mechanism correctly. I have my doubts that userspace developers will be able to understand the subtle difference between the two options and always choose correctly.... And then there's always the issue that we might end up with both flags set and we get conflicting bug reports about how atime is not behaving the way the applications want it to behave. > Then inode_maybe_inc_iversion() could - for atome updates - skip the > version update *unless* it sees that I_VERSION_QUERIED_STRICT bit. > > Does that sound sane to people? I'd much prefer we just do the right thing transparently at the filesystem level; all we need is for the inode to be flagged that it should be doing in memory atime updates rather than persistent updates. Perhaps the nfs server should just set a new S_LAZYTIME flag on inodes it accesses similar to how we can set S_NOATIME on inodes to elide atime updates altogether. Once set, the inode will behave that way until it is reclaimed from memory.... Cheers, Dave. -- Dave Chinner david@fromorbit.com
On Tue, 2023-10-24 at 14:40 +1100, Dave Chinner wrote:
> On Mon, Oct 23, 2023 at 02:18:12PM -1000, Linus Torvalds wrote:
> > On Mon, 23 Oct 2023 at 13:26, Dave Chinner <david@fromorbit.com> wrote:
> > >
> > > The problem is the first read request after a modification has been
> > > made. That is causing relatime to see mtime > atime and triggering
> > > an atime update. XFS sees this, does an atime update, and in
> > > committing that persistent inode metadata update, it calls
> > > inode_maybe_inc_iversion(force = false) to check if an iversion
> > > update is necessary. The VFS sees I_VERSION_QUERIED, and so it bumps
> > > i_version and tells XFS to persist it.
> >
> > Could we perhaps just have a mode where we don't increment i_version
> > for just atime updates?
> >
> > Maybe we don't even need a mode, and could just decide that atime
> > updates aren't i_version updates at all?
>
> We do that already - in memory atime updates don't bump i_version at
> all. The issue is the rare persistent atime update requests that
> still happen - they are the ones that trigger an i_version bump on
> XFS, and one of the relatime heuristics tickle this specific issue.
>
> If we push the problematic persistent atime updates to be in-memory
> updates only, then the whole problem with i_version goes away....
>
POSIX (more or less) states that if you're updating the inode for any
reason other than an atime update, then you need to update the ctime.
The NFSv4 spec (more or less) defines the change attribute as something
that should show a change any time the ctime would change.
Now, from mount(8) manpage, in the section on lazytime:
The on-disk timestamps are updated only when:
• the inode needs to be updated for some change unrelated to file timestamps
• the application employs fsync(2), syncfs(2), or sync(2)
• an undeleted inode is evicted from memory
• more than 24 hours have passed since the inode was written to disk.
The first is not a problem for NFS. If we're updating the ctime or mtime
or other attributes, then we _need_ to bump the change attribute
(assuming that something has queried it and can tell a difference).
The second is potentially an issue. If a file has an in-memory atime
update and them someone calls sync(2), XFS will end up bumping the
change attribute.
Ditto for the third. If someone does a getattr and brings an inode back
into core, then you're looking at a cache invalidation on the client.
The fourth is also a problem, once a day your clients will end up
invalidating their caches.
You might think "so what? Once a day is no big deal!", but there are
places that have built up cascading fscache setups across WANs to
distribute large, read-mostly files. This is quite problematic in these
sorts of setups as they end up seeing cache invalidations all over the
place.
noatime is a workaround, but it has its own problems and ugliness, and
it sucks that XFS doesn't "just work" when served by NFS.
> > Yes, yes, it's obviously technically a "inode modification", but does
> > anybody actually *want* atime updates with no actual other changes to
> > be version events?
>
> Well, yes, there was. That's why we defined i_version in the on disk
> format this way well over a decade ago. It was part of some deep
> dark magical HSM beans that allowed the application to combine
> multiple scans for different inode metadata changes into a single
> pass. atime changes was one of the things it needed to know about
> for tiering and space scavenging purposes....
>
As Amir points out, is this still behavior that you're required to
preserve? NFS serving is a bit more common than weird HSM stuff.
Maybe newly created XFS filesystems could be made to update i_version in
a way that nfsd expects? We could add a mkfs.xfs option to allow for the
legacy behavior if required.
> > Or maybe i_version can update, but callers of getattr() could have two
> > bits for that STATX_CHANGE_COOKIE, one for "I care about atime" and
> > one for others, and we'd pass that down to inode_query_version, and
> > we'd have a I_VERSION_QUERIED and a I_VERSION_QUERIED_STRICT, and the
> > "I care about atime" case ould set the strict one.
>
> This makes correct behaviour reliant on the application using the
> query mechanism correctly. I have my doubts that userspace
> developers will be able to understand the subtle difference between
> the two options and always choose correctly....
>
> And then there's always the issue that we might end up with both
> flags set and we get conflicting bug reports about how atime is not
> behaving the way the applications want it to behave.
>
> > Then inode_maybe_inc_iversion() could - for atime updates - skip the
> > version update *unless* it sees that I_VERSION_QUERIED_STRICT bit.
> >
> > Does that sound sane to people?
>
> I'd much prefer we just do the right thing transparently at the
> filesystem level; all we need is for the inode to be flagged that it
> should be doing in memory atime updates rather than persistent
> updates.
>
> Perhaps the nfs server should just set a new S_LAZYTIME flag on
> inodes it accesses similar to how we can set S_NOATIME on inodes to
> elide atime updates altogether. Once set, the inode will behave that
> way until it is reclaimed from memory....
>
Yeah, I think adding this sort of extra complexity on the query side is
probably not what's needed.
I'm also not crazy about trying to treat nfsd's accesses as special in
some way. nfsd is really not doing anything special at all, other than
querying for STATX_CHANGE_COOKIE.
The problem is on the update side: nfsd expects the STATX_CHANGE_COOKIE
to conform to certain behavior, and XFS simply does not (specifically
around updates to the inode that only change the atime).
--
Jeff Layton <jlayton@kernel.org>
On Tue, Oct 24, 2023 at 6:40 AM Dave Chinner <david@fromorbit.com> wrote: > > On Mon, Oct 23, 2023 at 02:18:12PM -1000, Linus Torvalds wrote: > > On Mon, 23 Oct 2023 at 13:26, Dave Chinner <david@fromorbit.com> wrote: > > > > > > The problem is the first read request after a modification has been > > > made. That is causing relatime to see mtime > atime and triggering > > > an atime update. XFS sees this, does an atime update, and in > > > committing that persistent inode metadata update, it calls > > > inode_maybe_inc_iversion(force = false) to check if an iversion > > > update is necessary. The VFS sees I_VERSION_QUERIED, and so it bumps > > > i_version and tells XFS to persist it. > > > > Could we perhaps just have a mode where we don't increment i_version > > for just atime updates? > > > > Maybe we don't even need a mode, and could just decide that atime > > updates aren't i_version updates at all? > > We do that already - in memory atime updates don't bump i_version at > all. The issue is the rare persistent atime update requests that > still happen - they are the ones that trigger an i_version bump on > XFS, and one of the relatime heuristics tickle this specific issue. > > If we push the problematic persistent atime updates to be in-memory > updates only, then the whole problem with i_version goes away.... > > > Yes, yes, it's obviously technically a "inode modification", but does > > anybody actually *want* atime updates with no actual other changes to > > be version events? > > Well, yes, there was. That's why we defined i_version in the on disk > format this way well over a decade ago. It was part of some deep > dark magical HSM beans that allowed the application to combine > multiple scans for different inode metadata changes into a single > pass. atime changes was one of the things it needed to know about > for tiering and space scavenging purposes.... 
> But if this is such an ancient mystical program, why do we have to keep this XFS behavior in the present? BTW, is this the same HSM whose DMAPI ioctls were deprecated a few years back? I mean, I understand that you do not want to change the behavior of i_version update without an opt-in config or mount option - let the distro make that choice. But calling this an "on-disk format change" is a very long stretch. Does xfs_repair guarantee that changes of atime, or any inode changes for that matter, update i_version? No, it does not. So IMO, "atime does not update i_version" is not an "on-disk format change", it is a runtime behavior change, just like lazytime is. Thanks, Amir.
On Tue, 2023-10-24 at 10:08 +0300, Amir Goldstein wrote: > On Tue, Oct 24, 2023 at 6:40 AM Dave Chinner <david@fromorbit.com> wrote: > > > > On Mon, Oct 23, 2023 at 02:18:12PM -1000, Linus Torvalds wrote: > > > On Mon, 23 Oct 2023 at 13:26, Dave Chinner <david@fromorbit.com> wrote: > > > > > > > > The problem is the first read request after a modification has been > > > > made. That is causing relatime to see mtime > atime and triggering > > > > an atime update. XFS sees this, does an atime update, and in > > > > committing that persistent inode metadata update, it calls > > > > inode_maybe_inc_iversion(force = false) to check if an iversion > > > > update is necessary. The VFS sees I_VERSION_QUERIED, and so it bumps > > > > i_version and tells XFS to persist it. > > > > > > Could we perhaps just have a mode where we don't increment i_version > > > for just atime updates? > > > > > > Maybe we don't even need a mode, and could just decide that atime > > > updates aren't i_version updates at all? > > > > We do that already - in memory atime updates don't bump i_version at > > all. The issue is the rare persistent atime update requests that > > still happen - they are the ones that trigger an i_version bump on > > XFS, and one of the relatime heuristics tickle this specific issue. > > > > If we push the problematic persistent atime updates to be in-memory > > updates only, then the whole problem with i_version goes away.... > > > > > Yes, yes, it's obviously technically a "inode modification", but does > > > anybody actually *want* atime updates with no actual other changes to > > > be version events? > > > > Well, yes, there was. That's why we defined i_version in the on disk > > format this way well over a decade ago. It was part of some deep > > dark magical HSM beans that allowed the application to combine > > multiple scans for different inode metadata changes into a single > > pass. 
atime changes was one of the things it needed to know about > > for tiering and space scavenging purposes.... > > > > But if this is such an ancient mystical program, why do we have to > keep this XFS behavior in the present? > BTW, is this the same HSM whose DMAPI ioctls were deprecated > a few years back? > > I mean, I understand that you do not want to change the behavior of > i_version update without an opt-in config or mount option - let the distro > make that choice. > But calling this an "on-disk format change" is a very long stretch. > > Does xfs_repair guarantee that changes of atime, or any inode changes > for that matter, update i_version? No, it does not. > So IMO, "atime does not update i_version" is not an "on-disk format change", > it is a runtime behavior change, just like lazytime is. > This would certainly be my preference. I don't want to break any existing users though. Perhaps this ought to be a mkfs option? Existing XFS filesystems could still behave with the legacy behavior, but we could make mkfs.xfs build filesystems by default that work like NFS requires. -- Jeff Layton <jlayton@kernel.org>
On Tue, Oct 24, 2023 at 02:40:06PM -0400, Jeff Layton wrote: > On Tue, 2023-10-24 at 10:08 +0300, Amir Goldstein wrote: > > On Tue, Oct 24, 2023 at 6:40 AM Dave Chinner <david@fromorbit.com> wrote: > > > > > > On Mon, Oct 23, 2023 at 02:18:12PM -1000, Linus Torvalds wrote: > > > > On Mon, 23 Oct 2023 at 13:26, Dave Chinner <david@fromorbit.com> wrote: > > > > > > > > > > The problem is the first read request after a modification has been > > > > > made. That is causing relatime to see mtime > atime and triggering > > > > > an atime update. XFS sees this, does an atime update, and in > > > > > committing that persistent inode metadata update, it calls > > > > > inode_maybe_inc_iversion(force = false) to check if an iversion > > > > > update is necessary. The VFS sees I_VERSION_QUERIED, and so it bumps > > > > > i_version and tells XFS to persist it. > > > > > > > > Could we perhaps just have a mode where we don't increment i_version > > > > for just atime updates? > > > > > > > > Maybe we don't even need a mode, and could just decide that atime > > > > updates aren't i_version updates at all? > > > > > > We do that already - in memory atime updates don't bump i_version at > > > all. The issue is the rare persistent atime update requests that > > > still happen - they are the ones that trigger an i_version bump on > > > XFS, and one of the relatime heuristics tickle this specific issue. > > > > > > If we push the problematic persistent atime updates to be in-memory > > > updates only, then the whole problem with i_version goes away.... > > > > > > > Yes, yes, it's obviously technically a "inode modification", but does > > > > anybody actually *want* atime updates with no actual other changes to > > > > be version events? > > > > > > Well, yes, there was. That's why we defined i_version in the on disk > > > format this way well over a decade ago. 
It was part of some deep > > > dark magical HSM beans that allowed the application to combine > > > multiple scans for different inode metadata changes into a single > > > pass. atime changes was one of the things it needed to know about > > > for tiering and space scavenging purposes.... > > > > > > > But if this is such an ancient mystical program, why do we have to > > keep this XFS behavior in the present? > > BTW, is this the same HSM whose DMAPI ioctls were deprecated > > a few years back? Drop the attitude, Amir. That "ancient mystical program" is this: https://buy.hpe.com/us/en/enterprise-solutions/high-performance-computing-solutions/high-performance-computing-storage-solutions/hpc-storage-solutions/hpe-data-management-framework-7/p/1010144088 Yup, that product is backed by a proprietary descendent of the Irix XFS code base XFS that is DMAPI enabled and still in use today. It's called HPE XFS these days.... > > I mean, I understand that you do not want to change the behavior of > > i_version update without an opt-in config or mount option - let the distro > > make that choice. > > But calling this an "on-disk format change" is a very long stretch. Telling the person who created, defined and implemented the on disk format that they don't know what constitutes a change of that on-disk format seems kinda Dunning-Kruger to me.... There are *lots* of ways that di_changecount is now incompatible with the VFS change counter. That's now defined as "i_version should only change when [cm]time is changed". di_changecount is defined to be a count of the number of changes made to the attributes of the inode. It's not just atime at issue here - we bump di_changecount when make any inode change, including background work that does not otherwise change timestamps. e.g. allocation at writeback time, unwritten extent conversion, on-disk EOF extension at IO completion, removal of speculative pre-allocation beyond EOF, etc. 
IOWs, di_changecount was never defined as a linux "i_version" counter, regardless of the fact we originally we able to implement i_version with it - all extra bumps to di_changecount were not important to the users of i_version for about a decade. Unfortunately, the new i_version definition is very much incompatible with the existing di_changecount definition and that's the underlying problem here. i.e. the problem is not that we bump i_version on atime, it's that di_changecount is now completely incompatible with the new i_version change semantics. To implement the new i_version semantics exactly, we need to add a new field to the inode to hold this information. If we change the on disk format like this, then the atime problems go away because the new field would not get updated on atime updates. We'd still be bumping di_changecount on atime updates, though, because that's what is required by the on-disk format. I'm really trying to avoid changing the on-disk format unless it is absolutely necessary. If we can get the in-memory timestamp updates to avoid tripping di_changecount updates then the atime problems go away. If we can get [cm]time sufficiently fine grained that we don't need i_version, then we can turn off i_version in XFS and di_changecount ends up being entirely internal. That's what was attempted with generic multi-grain timestamps, but that hasn't worked. Another options is for XFS to play it's own internal tricks with [cm]time granularity and turn off i_version. e.g. limit external timestamp visibility to 1us and use the remaining dozen bits of the ns field to hold a change counter for updates within a single coarse timer tick. This guarantees the timestamp changes within a coarse tick for the purposes of change detection, but we don't expose those bits to applications so applications that compare timestamps across inodes won't get things back to front like was happening with the multi-grain timestamps.... 
Another option is to work around the visible symptoms of the semantic mismatch between i_version and di_changecount. The only visible symptom we currently know about is the atime vs i_version issue. If people are happy for us to simply ignore VFS atime guidelines (i.e. ignore realtime/lazytime) and do completely our own stuff with timestamp update deferal, then that also solve the immediate issues. > > Does xfs_repair guarantee that changes of atime, or any inode changes > > for that matter, update i_version? No, it does not. > > So IMO, "atime does not update i_version" is not an "on-disk format change", > > it is a runtime behavior change, just like lazytime is. > > This would certainly be my preference. I don't want to break any > existing users though. That's why I'm trying to get some kind of consensus on what rules and/or atime configurations people are happy for me to break to make it look to users like there's a viable working change attribute being supplied by XFS without needing to change the on disk format. > Perhaps this ought to be a mkfs option? Existing XFS filesystems could > still behave with the legacy behavior, but we could make mkfs.xfs build > filesystems by default that work like NFS requires. If we require mkfs to set a flag to change behaviour, then we're talking about making an explicit on-disk format change to select the optional behaviour. That's precisely what I want to avoid. -Dave. -- Dave Chinner david@fromorbit.com
On Wed, 2023-10-25 at 19:05 +1100, Dave Chinner wrote: > On Tue, Oct 24, 2023 at 02:40:06PM -0400, Jeff Layton wrote: > > On Tue, 2023-10-24 at 10:08 +0300, Amir Goldstein wrote: > > > On Tue, Oct 24, 2023 at 6:40 AM Dave Chinner <david@fromorbit.com> wrote: > > > > > > > > On Mon, Oct 23, 2023 at 02:18:12PM -1000, Linus Torvalds wrote: > > > > > On Mon, 23 Oct 2023 at 13:26, Dave Chinner <david@fromorbit.com> wrote: > > > > > > > > > > > > The problem is the first read request after a modification has been > > > > > > made. That is causing relatime to see mtime > atime and triggering > > > > > > an atime update. XFS sees this, does an atime update, and in > > > > > > committing that persistent inode metadata update, it calls > > > > > > inode_maybe_inc_iversion(force = false) to check if an iversion > > > > > > update is necessary. The VFS sees I_VERSION_QUERIED, and so it bumps > > > > > > i_version and tells XFS to persist it. > > > > > > > > > > Could we perhaps just have a mode where we don't increment i_version > > > > > for just atime updates? > > > > > > > > > > Maybe we don't even need a mode, and could just decide that atime > > > > > updates aren't i_version updates at all? > > > > > > > > We do that already - in memory atime updates don't bump i_version at > > > > all. The issue is the rare persistent atime update requests that > > > > still happen - they are the ones that trigger an i_version bump on > > > > XFS, and one of the relatime heuristics tickle this specific issue. > > > > > > > > If we push the problematic persistent atime updates to be in-memory > > > > updates only, then the whole problem with i_version goes away.... > > > > > > > > > Yes, yes, it's obviously technically a "inode modification", but does > > > > > anybody actually *want* atime updates with no actual other changes to > > > > > be version events? > > > > > > > > Well, yes, there was. 
That's why we defined i_version in the on disk > > > > format this way well over a decade ago. It was part of some deep > > > > dark magical HSM beans that allowed the application to combine > > > > multiple scans for different inode metadata changes into a single > > > > pass. atime changes was one of the things it needed to know about > > > > for tiering and space scavenging purposes.... > > > > > > > > > > But if this is such an ancient mystical program, why do we have to > > > keep this XFS behavior in the present? > > > BTW, is this the same HSM whose DMAPI ioctls were deprecated > > > a few years back? > > Drop the attitude, Amir. > > That "ancient mystical program" is this: > > https://buy.hpe.com/us/en/enterprise-solutions/high-performance-computing-solutions/high-performance-computing-storage-solutions/hpc-storage-solutions/hpe-data-management-framework-7/p/1010144088 > > Yup, that product is backed by a proprietary descendent of the Irix > XFS code base XFS that is DMAPI enabled and still in use today. It's > called HPE XFS these days.... > > > > I mean, I understand that you do not want to change the behavior of > > > i_version update without an opt-in config or mount option - let the distro > > > make that choice. > > > But calling this an "on-disk format change" is a very long stretch. > > Telling the person who created, defined and implemented the on disk > format that they don't know what constitutes a change of that > on-disk format seems kinda Dunning-Kruger to me.... > > There are *lots* of ways that di_changecount is now incompatible > with the VFS change counter. That's now defined as "i_version should > only change when [cm]time is changed". > > di_changecount is defined to be a count of the number of changes > made to the attributes of the inode. It's not just atime at issue > here - we bump di_changecount when make any inode change, including > background work that does not otherwise change timestamps. e.g. 
> allocation at writeback time, unwritten extent conversion, on-disk > EOF extension at IO completion, removal of speculative > pre-allocation beyond EOF, etc. > > IOWs, di_changecount was never defined as a linux "i_version" > counter, regardless of the fact we originally we able to implement > i_version with it - all extra bumps to di_changecount were not > important to the users of i_version for about a decade. > > Unfortunately, the new i_version definition is very much > incompatible with the existing di_changecount definition and that's > the underlying problem here. i.e. the problem is not that we bump > i_version on atime, it's that di_changecount is now completely > incompatible with the new i_version change semantics. > > To implement the new i_version semantics exactly, we need to add a > new field to the inode to hold this information. > If we change the on disk format like this, then the atime > problems go away because the new field would not get updated on > atime updates. We'd still be bumping di_changecount on atime > updates, though, because that's what is required by the on-disk > format. > > I'm really trying to avoid changing the on-disk format unless it > is absolutely necessary. If we can get the in-memory timestamp > updates to avoid tripping di_changecount updates then the atime > problems go away. > > If we can get [cm]time sufficiently fine grained that we don't need > i_version, then we can turn off i_version in XFS and di_changecount > ends up being entirely internal. That's what was attempted with > generic multi-grain timestamps, but that hasn't worked. > > Another options is for XFS to play it's own internal tricks with > [cm]time granularity and turn off i_version. e.g. limit external > timestamp visibility to 1us and use the remaining dozen bits of the > ns field to hold a change counter for updates within a single coarse > timer tick. 
This guarantees the timestamp changes within a coarse > tick for the purposes of change detection, but we don't expose those > bits to applications so applications that compare timestamps across > inodes won't get things back to front like was happening with the > multi-grain timestamps.... > > Another option is to work around the visible symptoms of the > semantic mismatch between i_version and di_changecount. The only > visible symptom we currently know about is the atime vs i_version > issue. If people are happy for us to simply ignore VFS atime > guidelines (i.e. ignore realtime/lazytime) and do completely our own > stuff with timestamp update deferal, then that also solve the > immediate issues. > > > > Does xfs_repair guarantee that changes of atime, or any inode changes > > > for that matter, update i_version? No, it does not. > > > So IMO, "atime does not update i_version" is not an "on-disk format change", > > > it is a runtime behavior change, just like lazytime is. > > > > This would certainly be my preference. I don't want to break any > > existing users though. > > That's why I'm trying to get some kind of consensus on what > rules and/or atime configurations people are happy for me to break > to make it look to users like there's a viable working change > attribute being supplied by XFS without needing to change the on > disk format. > I agree that the only bone of contention is whether to count atime updates against the change attribute. I think we have consensus that all in-kernel users do _not_ want atime updates counted against the change attribute. The only real question is these "legacy" users of di_changecount. > > Perhaps this ought to be a mkfs option? Existing XFS filesystems could > > still behave with the legacy behavior, but we could make mkfs.xfs build > > filesystems by default that work like NFS requires. 
> > If we require mkfs to set a flag to change behaviour, then we're > talking about making an explicit on-disk format change to select the > optional behaviour. That's precisely what I want to avoid. > Right. The on-disk di_changecount would have a (subtly) different meaning at that point. It's not a change that requires drastic retooling though. If we were to do this, we wouldn't need to grow the on-disk inode. Booting to an older kernel would cause the behavior to revert. That's sub-optimal, but not fatal. What I don't quite understand is how these tools are accessing di_changecount? XFS only accesses the di_changecount to propagate the value to and from the i_version, and there is nothing besides NFSD and IMA that queries the i_version value in-kernel. So, this must be done via some sort of userland tool that is directly accessing the block device (or some 3rd party kernel module). In earlier discussions you alluded to some repair and/or analysis tools that depended on this counter. I took a quick look in xfsprogs, but I didn't see anything there. Is there a library or something that these tools use to get at this value? -- Jeff Layton <jlayton@kernel.org>
On Wed, Oct 25, 2023 at 08:25:35AM -0400, Jeff Layton wrote: > On Wed, 2023-10-25 at 19:05 +1100, Dave Chinner wrote: > > On Tue, Oct 24, 2023 at 02:40:06PM -0400, Jeff Layton wrote: > > > On Tue, 2023-10-24 at 10:08 +0300, Amir Goldstein wrote: > > > > On Tue, Oct 24, 2023 at 6:40 AM Dave Chinner <david@fromorbit.com> wrote: > > > > > > > > > > On Mon, Oct 23, 2023 at 02:18:12PM -1000, Linus Torvalds wrote: > > > > > > On Mon, 23 Oct 2023 at 13:26, Dave Chinner <david@fromorbit.com> wrote: > > > > Does xfs_repair guarantee that changes of atime, or any inode changes > > > > for that matter, update i_version? No, it does not. > > > > So IMO, "atime does not update i_version" is not an "on-disk format change", > > > > it is a runtime behavior change, just like lazytime is. > > > > > > This would certainly be my preference. I don't want to break any > > > existing users though. > > > > That's why I'm trying to get some kind of consensus on what > > rules and/or atime configurations people are happy for me to break > > to make it look to users like there's a viable working change > > attribute being supplied by XFS without needing to change the on > > disk format. > > > > I agree that the only bone of contention is whether to count atime > updates against the change attribute. I think we have consensus that all > in-kernel users do _not_ want atime updates counted against the change > attribute. The only real question is these "legacy" users of > di_changecount. Please stop refering to "legacy users" of di_changecount. Whether there are users or not is irrelevant - it is defined by the current on-disk format specification, and as such there may be applications we do not know about making use of the current behaviour. It's like a linux syscall - we can't remove them because there may be some user we don't know about still using that old syscall. We simply don't make changes that can potentially break user applications like that. 
The on disk format is the same - there is software out that we don't know about that expects a certain behaviour based on the specification. We don't break the on disk format by making silent behavioural changes - we require a feature flag to indicate behaviour has changed so that applications can take appropriate actions with stuff they don't understand. The example for this is the BIGTIME timestamp format change. The on disk inode structure is physically unchanged, but the contents of the timestamp fields are encoded very differently. Sure, the older kernels can read the timestamp data without any sort of problem occurring, except for the fact the timestamps now appear to be completely corrupted. Changing the meaning of ithe contents of di_changecount is no different. It might look OK and nothing crashes, but nothing can be inferred from the value in the field because we don't know how it has been modified. Hence we can't just change the meaning, encoding or behaviour of an on disk field that would result in existing kernels and applications doing the wrong thing with that field (either read or write) without adding a feature flag to indicate what behaviour that field should have. > > > Perhaps this ought to be a mkfs option? Existing XFS filesystems could > > > still behave with the legacy behavior, but we could make mkfs.xfs build > > > filesystems by default that work like NFS requires. > > > > If we require mkfs to set a flag to change behaviour, then we're > > talking about making an explicit on-disk format change to select the > > optional behaviour. That's precisely what I want to avoid. > > > > Right. The on-disk di_changecount would have a (subtly) different > meaning at that point. > > It's not a change that requires drastic retooling though. If we were to > do this, we wouldn't need to grow the on-disk inode. Booting to an older > kernel would cause the behavior to revert. That's sub-optimal, but not > fatal. 
See above: redefining the contents, behaviour or encoding of an on disk field is a change of the on-disk format specification. The rules for on disk format changes that we work to were set in place long before I started working on XFS. They are sane, well thought out rules that have stood the test of time and massive new feature introductions (CRCs, reflink, rmap, etc). And they only work because we don't allow anyone to bend them for convenience, short cuts or expediting their pet project. > What I don't quite understand is how these tools are accessing > di_changecount? As I keep saying: this is largely irrelevant to the problem at hand. > XFS only accesses the di_changecount to propagate the value to and from > the i_version, Yes. XFS has a strong separation between on-disk structures and in-memory values, and i_version is simply the in-memory field we use to store the current di_changecount value. We force bump i_version every time we modify the inode core regardless of whether anyone has queried i_version because that's what di_changecount requires. i.e. the filesystem controls the contents of i_version, not the VFS. Now that NFS is using a proper abstraction (i.e. vfs_statx()) to get the change cookie, we really don't need to expose di_changecount in i_version at all - we could simply copy an internal di_changecount value into the statx cookie field in xfs_vn_getattr() and there would be almost no change of behaviour from the perspective of NFS and IMA at all. > and there is nothing besides NFSD and IMA that queries > the i_version value in-kernel. So, this must be done via some sort of > userland tool that is directly accessing the block device (or some 3rd > party kernel module). Yup, both of those sort of applications exist. e.g. the DMAPI kernel module allows direct access to inode metadata through a custom bulkstat formatter implementation - it returns different information comapred to the standard XFS one in the upstream kernel. 
> In earlier discussions you alluded to some repair and/or analysis tools > that depended on this counter. Yes, and one of those "tools" is *me*. I frequently look at the di_changecount when doing forensic and/or failure analysis on filesystem corpses. SOE analysis, relative modification activity, etc all give insight into what happened to the filesystem to get it into the state it is currently in, and di_changecount provides information no other metadata in the inode contains. > I took a quick look in xfsprogs, but I > didn't see anything there. Is there a library or something that these > tools use to get at this value? xfs_db is the tool I use for this, such as: $ sudo xfs_db -c "sb 0" -c "a rootino" -c "p v3.change_count" /dev/mapper/fast v3.change_count = 35 $ The root inode in this filesystem has a change count of 35. The root inode has 32 dirents in it, which means that no entries have ever been removed or renamed. This sort of insight into the past history of inode metadata is largely impossible to get any other way, and it's been the difference between understanding failure and having no clue more than once. Most block device parsing applications simply write their own decoder that walks the on-disk format. That's pretty trivial to do, developers can get all the information needed to do this from the on-disk format specification documentation we keep on kernel.org... -Dave. -- Dave Chinner david@fromorbit.com
On Thu, 2023-10-26 at 13:20 +1100, Dave Chinner wrote: > On Wed, Oct 25, 2023 at 08:25:35AM -0400, Jeff Layton wrote: > > On Wed, 2023-10-25 at 19:05 +1100, Dave Chinner wrote: > > > On Tue, Oct 24, 2023 at 02:40:06PM -0400, Jeff Layton wrote: > > > > On Tue, 2023-10-24 at 10:08 +0300, Amir Goldstein wrote: > > > > > On Tue, Oct 24, 2023 at 6:40 AM Dave Chinner <david@fromorbit.com> wrote: > > > > > > > > > > > > On Mon, Oct 23, 2023 at 02:18:12PM -1000, Linus Torvalds wrote: > > > > > > > On Mon, 23 Oct 2023 at 13:26, Dave Chinner <david@fromorbit.com> wrote: > > > > > Does xfs_repair guarantee that changes of atime, or any inode changes > > > > > for that matter, update i_version? No, it does not. > > > > > So IMO, "atime does not update i_version" is not an "on-disk format change", > > > > > it is a runtime behavior change, just like lazytime is. > > > > > > > > This would certainly be my preference. I don't want to break any > > > > existing users though. > > > > > > That's why I'm trying to get some kind of consensus on what > > > rules and/or atime configurations people are happy for me to break > > > to make it look to users like there's a viable working change > > > attribute being supplied by XFS without needing to change the on > > > disk format. > > > > > > > I agree that the only bone of contention is whether to count atime > > updates against the change attribute. I think we have consensus that all > > in-kernel users do _not_ want atime updates counted against the change > > attribute. The only real question is these "legacy" users of > > di_changecount. > > Please stop refering to "legacy users" of di_changecount. Whether > there are users or not is irrelevant - it is defined by the current > on-disk format specification, and as such there may be applications > we do not know about making use of the current behaviour. 
> > It's like a linux syscall - we can't remove them because there may > be some user we don't know about still using that old syscall. We > simply don't make changes that can potentially break user > applications like that. > > The on disk format is the same - there is software out that we don't > know about that expects a certain behaviour based on the > specification. We don't break the on disk format by making silent > behavioural changes - we require a feature flag to indicate > behaviour has changed so that applications can take appropriate > actions with stuff they don't understand. > > The example for this is the BIGTIME timestamp format change. The on > disk inode structure is physically unchanged, but the contents of > the timestamp fields are encoded very differently. Sure, the older > kernels can read the timestamp data without any sort of problem > occurring, except for the fact the timestamps now appear to be > completely corrupted. > > Changing the meaning of ithe contents of di_changecount is no > different. It might look OK and nothing crashes, but nothing can be > inferred from the value in the field because we don't know how it > has been modified. > > Hence we can't just change the meaning, encoding or behaviour of an > on disk field that would result in existing kernels and applications > doing the wrong thing with that field (either read or write) without > adding a feature flag to indicate what behaviour that field should > have. > > > > > Perhaps this ought to be a mkfs option? Existing XFS filesystems could > > > > still behave with the legacy behavior, but we could make mkfs.xfs build > > > > filesystems by default that work like NFS requires. > > > > > > If we require mkfs to set a flag to change behaviour, then we're > > > talking about making an explicit on-disk format change to select the > > > optional behaviour. That's precisely what I want to avoid. > > > > > > > Right. 
The on-disk di_changecount would have a (subtly) different > > meaning at that point. > > > > It's not a change that requires drastic retooling though. If we were to > > do this, we wouldn't need to grow the on-disk inode. Booting to an older > > kernel would cause the behavior to revert. That's sub-optimal, but not > > fatal. > > See above: redefining the contents, behaviour or encoding of an on > disk field is a change of the on-disk format specification. > > The rules for on disk format changes that we work to were set in > place long before I started working on XFS. They are sane, well > thought out rules that have stood the test of time and massive new > feature introductions (CRCs, reflink, rmap, etc). And they only work > because we don't allow anyone to bend them for convenience, short > cuts or expediting their pet project. > > > What I don't quite understand is how these tools are accessing > > di_changecount? > > As I keep saying: this is largely irrelevant to the problem at hand. > > > XFS only accesses the di_changecount to propagate the value to and from > > the i_version, > > Yes. XFS has a strong separation between on-disk structures and > in-memory values, and i_version is simply the in-memory field we use > to store the current di_changecount value. We force bump i_version > every time we modify the inode core regardless of whether anyone has > queried i_version because that's what di_changecount requires. i.e. > the filesystem controls the contents of i_version, not the VFS. > > Now that NFS is using a proper abstraction (i.e. vfs_statx()) to get > the change cookie, we really don't need to expose di_changecount in > i_version at all - we could simply copy an internal di_changecount > value into the statx cookie field in xfs_vn_getattr() and there > would be almost no change of behaviour from the perspective of NFS > and IMA at all. > > > and there is nothing besides NFSD and IMA that queries > > the i_version value in-kernel. 
So, this must be done via some sort of > > userland tool that is directly accessing the block device (or some 3rd > > party kernel module). > > Yup, both of those sort of applications exist. e.g. the DMAPI kernel > module allows direct access to inode metadata through a custom > bulkstat formatter implementation - it returns different information > comapred to the standard XFS one in the upstream kernel. > > > In earlier discussions you alluded to some repair and/or analysis tools > > that depended on this counter. > > Yes, and one of those "tools" is *me*. > > I frequently look at the di_changecount when doing forensic and/or > failure analysis on filesystem corpses. SOE analysis, relative > modification activity, etc all give insight into what happened to > the filesystem to get it into the state it is currently in, and > di_changecount provides information no other metadata in the inode > contains. > > > I took a quick look in xfsprogs, but I > > didn't see anything there. Is there a library or something that these > > tools use to get at this value? > > xfs_db is the tool I use for this, such as: > > $ sudo xfs_db -c "sb 0" -c "a rootino" -c "p v3.change_count" /dev/mapper/fast > v3.change_count = 35 > $ > > The root inode in this filesystem has a change count of 35. The root > inode has 32 dirents in it, which means that no entries have ever > been removed or renamed. This sort of insight into the past history > of inode metadata is largely impossible to get any other way, and > it's been the difference between understanding failure and having no > clue more than once. > > Most block device parsing applications simply write their own > decoder that walks the on-disk format. That's pretty trivial to do, > developers can get all the information needed to do this from the > on-disk format specification documentation we keep on kernel.org... > Fair enough. I'm not here to tell you that you guys that you need to change how di_changecount works. 
If it's too valuable to keep it counting atime-only updates, then so be it. If that's the case however, and given that the multigrain timestamp work is effectively dead, then I don't see an alternative to growing the on- disk inode. Do you? -- Jeff Layton <jlayton@kernel.org>
On Fri, 27 Oct 2023 at 20:36, Jeff Layton <jlayton@kernel.org> wrote: > > On Thu, 2023-10-26 at 13:20 +1100, Dave Chinner wrote: > > On Wed, Oct 25, 2023 at 08:25:35AM -0400, Jeff Layton wrote: > > > On Wed, 2023-10-25 at 19:05 +1100, Dave Chinner wrote: > > > > On Tue, Oct 24, 2023 at 02:40:06PM -0400, Jeff Layton wrote: > > > > > On Tue, 2023-10-24 at 10:08 +0300, Amir Goldstein wrote: > > > > > > On Tue, Oct 24, 2023 at 6:40 AM Dave Chinner <david@fromorbit.com> wrote: > > > > > > > > > > > > > > On Mon, Oct 23, 2023 at 02:18:12PM -1000, Linus Torvalds wrote: > > > > > > > > On Mon, 23 Oct 2023 at 13:26, Dave Chinner <david@fromorbit.com> wrote: > > > > > > Does xfs_repair guarantee that changes of atime, or any inode changes > > > > > > for that matter, update i_version? No, it does not. > > > > > > So IMO, "atime does not update i_version" is not an "on-disk format change", > > > > > > it is a runtime behavior change, just like lazytime is. > > > > > > > > > > This would certainly be my preference. I don't want to break any > > > > > existing users though. > > > > > > > > That's why I'm trying to get some kind of consensus on what > > > > rules and/or atime configurations people are happy for me to break > > > > to make it look to users like there's a viable working change > > > > attribute being supplied by XFS without needing to change the on > > > > disk format. > > > > > > > > > > I agree that the only bone of contention is whether to count atime > > > updates against the change attribute. I think we have consensus that all > > > in-kernel users do _not_ want atime updates counted against the change > > > attribute. The only real question is these "legacy" users of > > > di_changecount. > > > > Please stop refering to "legacy users" of di_changecount. 
Whether > > there are users or not is irrelevant - it is defined by the current > > on-disk format specification, and as such there may be applications > > we do not know about making use of the current behaviour. > > > > It's like a linux syscall - we can't remove them because there may > > be some user we don't know about still using that old syscall. We > > simply don't make changes that can potentially break user > > applications like that. > > > > The on disk format is the same - there is software out that we don't > > know about that expects a certain behaviour based on the > > specification. We don't break the on disk format by making silent > > behavioural changes - we require a feature flag to indicate > > behaviour has changed so that applications can take appropriate > > actions with stuff they don't understand. > > > > The example for this is the BIGTIME timestamp format change. The on > > disk inode structure is physically unchanged, but the contents of > > the timestamp fields are encoded very differently. Sure, the older > > kernels can read the timestamp data without any sort of problem > > occurring, except for the fact the timestamps now appear to be > > completely corrupted. > > > > Changing the meaning of ithe contents of di_changecount is no > > different. It might look OK and nothing crashes, but nothing can be > > inferred from the value in the field because we don't know how it > > has been modified. > > > > Hence we can't just change the meaning, encoding or behaviour of an > > on disk field that would result in existing kernels and applications > > doing the wrong thing with that field (either read or write) without > > adding a feature flag to indicate what behaviour that field should > > have. > > > > > > > Perhaps this ought to be a mkfs option? Existing XFS filesystems could > > > > > still behave with the legacy behavior, but we could make mkfs.xfs build > > > > > filesystems by default that work like NFS requires. 
> > > > > > > > If we require mkfs to set a flag to change behaviour, then we're > > > > talking about making an explicit on-disk format change to select the > > > > optional behaviour. That's precisely what I want to avoid. > > > > > > > > > > Right. The on-disk di_changecount would have a (subtly) different > > > meaning at that point. > > > > > > It's not a change that requires drastic retooling though. If we were to > > > do this, we wouldn't need to grow the on-disk inode. Booting to an older > > > kernel would cause the behavior to revert. That's sub-optimal, but not > > > fatal. > > > > See above: redefining the contents, behaviour or encoding of an on > > disk field is a change of the on-disk format specification. > > > > The rules for on disk format changes that we work to were set in > > place long before I started working on XFS. They are sane, well > > thought out rules that have stood the test of time and massive new > > feature introductions (CRCs, reflink, rmap, etc). And they only work > > because we don't allow anyone to bend them for convenience, short > > cuts or expediting their pet project. > > > > > What I don't quite understand is how these tools are accessing > > > di_changecount? > > > > As I keep saying: this is largely irrelevant to the problem at hand. > > > > > XFS only accesses the di_changecount to propagate the value to and from > > > the i_version, > > > > Yes. XFS has a strong separation between on-disk structures and > > in-memory values, and i_version is simply the in-memory field we use > > to store the current di_changecount value. We force bump i_version > > every time we modify the inode core regardless of whether anyone has > > queried i_version because that's what di_changecount requires. i.e. > > the filesystem controls the contents of i_version, not the VFS. > > > > Now that NFS is using a proper abstraction (i.e. 
vfs_statx()) to get > > the change cookie, we really don't need to expose di_changecount in > > i_version at all - we could simply copy an internal di_changecount > > value into the statx cookie field in xfs_vn_getattr() and there > > would be almost no change of behaviour from the perspective of NFS > > and IMA at all. > > > > > and there is nothing besides NFSD and IMA that queries > > > the i_version value in-kernel. So, this must be done via some sort of > > > userland tool that is directly accessing the block device (or some 3rd > > > party kernel module). > > > > Yup, both of those sort of applications exist. e.g. the DMAPI kernel > > module allows direct access to inode metadata through a custom > > bulkstat formatter implementation - it returns different information > > comapred to the standard XFS one in the upstream kernel. > > > > > In earlier discussions you alluded to some repair and/or analysis tools > > > that depended on this counter. > > > > Yes, and one of those "tools" is *me*. > > > > I frequently look at the di_changecount when doing forensic and/or > > failure analysis on filesystem corpses. SOE analysis, relative > > modification activity, etc all give insight into what happened to > > the filesystem to get it into the state it is currently in, and > > di_changecount provides information no other metadata in the inode > > contains. > > > > > I took a quick look in xfsprogs, but I > > > didn't see anything there. Is there a library or something that these > > > tools use to get at this value? > > > > xfs_db is the tool I use for this, such as: > > > > $ sudo xfs_db -c "sb 0" -c "a rootino" -c "p v3.change_count" /dev/mapper/fast > > v3.change_count = 35 > > $ > > > > The root inode in this filesystem has a change count of 35. The root > > inode has 32 dirents in it, which means that no entries have ever > > been removed or renamed. 
This sort of insight into the past history > > of inode metadata is largely impossible to get any other way, and > > it's been the difference between understanding failure and having no > > clue more than once. > > > > Most block device parsing applications simply write their own > > decoder that walks the on-disk format. That's pretty trivial to do, > > developers can get all the information needed to do this from the > > on-disk format specification documentation we keep on kernel.org... > > > > Fair enough. I'm not here to tell you that you guys that you need to > change how di_changecount works. If it's too valuable to keep it > counting atime-only updates, then so be it. > > If that's the case however, and given that the multigrain timestamp work > is effectively dead, then I don't see an alternative to growing the on- > disk inode. Do you? Would a new mount option be a viable alternative? A new option that would when used change the semantics of these fields to what NFS needs? With the caveat: using this mount option may break other special tools that depend on the default semantics. > -- > Jeff Layton <jlayton@kernel.org>
On Fri, Oct 27, 2023 at 06:35:58AM -0400, Jeff Layton wrote: > On Thu, 2023-10-26 at 13:20 +1100, Dave Chinner wrote: > > On Wed, Oct 25, 2023 at 08:25:35AM -0400, Jeff Layton wrote: > > > On Wed, 2023-10-25 at 19:05 +1100, Dave Chinner wrote: > > > > On Tue, Oct 24, 2023 at 02:40:06PM -0400, Jeff Layton wrote: > > > In earlier discussions you alluded to some repair and/or analysis tools > > > that depended on this counter. > > > > Yes, and one of those "tools" is *me*. > > > > I frequently look at the di_changecount when doing forensic and/or > > failure analysis on filesystem corpses. SOE analysis, relative > > modification activity, etc all give insight into what happened to > > the filesystem to get it into the state it is currently in, and > > di_changecount provides information no other metadata in the inode > > contains. > > > > > I took a quick look in xfsprogs, but I > > > didn't see anything there. Is there a library or something that these > > > tools use to get at this value? > > > > xfs_db is the tool I use for this, such as: > > > > $ sudo xfs_db -c "sb 0" -c "a rootino" -c "p v3.change_count" /dev/mapper/fast > > v3.change_count = 35 > > $ > > > > The root inode in this filesystem has a change count of 35. The root > > inode has 32 dirents in it, which means that no entries have ever > > been removed or renamed. This sort of insight into the past history > > of inode metadata is largely impossible to get any other way, and > > it's been the difference between understanding failure and having no > > clue more than once. > > > > Most block device parsing applications simply write their own > > decoder that walks the on-disk format. That's pretty trivial to do, > > developers can get all the information needed to do this from the > > on-disk format specification documentation we keep on kernel.org... > > > > Fair enough. I'm not here to tell you that you guys that you need to > change how di_changecount works. 
If it's too valuable to keep it > counting atime-only updates, then so be it. > > If that's the case however, and given that the multigrain timestamp work > is effectively dead, then I don't see an alternative to growing the on- > disk inode. Do you? Yes, I do see alternatives. That's what I've been trying (unsuccessfully) to describe and get consensus on. I feel like I'm being ignored and rail-roaded here, because nobody is even acknowledging that I'm proposing alternatives and keeps insisting that the only solution is a change of on-disk format. So, I'll summarise the situation *yet again* in the hope that this time I won't get people arguing about atime vs i-version and what constitutes an on-disk format change because that goes nowhere and does nothing to determine which solution might be acceptible. The basic situation is this: If XFS can ignore relatime or lazytime persistent updates for given situations, then *we don't need to make periodic on-disk updates of atime*. This makes the whole problem of "persistent atime update bumps i_version" go away because then we *aren't making persistent atime updates* except when some other persistent modification that bumps [cm]time occurs. But I don't want to do this unconditionally - for systems not running anything that samples i_version we want relatime/lazytime to behave as they are supposed to and do periodic persistent updates as per normal. Principle of least surprise and all that jazz. So we really need an indication for inodes that we should enable this mode for the inode. I have asked if we can have per-operation context flag to trigger this given the needs for io_uring to have context flags for timestamp updates to be added. I have asked if we can have an inode flag set by the VFS or application code for this. e.g. a flag set by nfsd whenever it accesses a given inode. 
I have asked if this inode flag can just be triggered if we ever see I_VERSION_QUERIED set or statx is used to retrieve a change cookie, and whether this is a reliable mechanism for setting such a flag. I have suggested mechanisms for using masked off bits of timestamps to encode sub-timestamp granularity change counts and keep them invisible to userspace and then not using i_version at all for XFS. This avoids all the problems that the multi-grain timestamp infrastructure exposed due to variable granularity of user visible timestamps and ordering across inodes with different granularity. This is potentially a general solution, too. So, yeah, there are *lots* of ways we can solve this problem without needing to change on-disk formats. -Dave. -- Dave Chinner david@fromorbit.com
On Tue, 2023-10-31 at 09:37 +1100, Dave Chinner wrote: > On Fri, Oct 27, 2023 at 06:35:58AM -0400, Jeff Layton wrote: > > On Thu, 2023-10-26 at 13:20 +1100, Dave Chinner wrote: > > > On Wed, Oct 25, 2023 at 08:25:35AM -0400, Jeff Layton wrote: > > > > On Wed, 2023-10-25 at 19:05 +1100, Dave Chinner wrote: > > > > > On Tue, Oct 24, 2023 at 02:40:06PM -0400, Jeff Layton wrote: > > > > In earlier discussions you alluded to some repair and/or analysis tools > > > > that depended on this counter. > > > > > > Yes, and one of those "tools" is *me*. > > > > > > I frequently look at the di_changecount when doing forensic and/or > > > failure analysis on filesystem corpses. SOE analysis, relative > > > modification activity, etc all give insight into what happened to > > > the filesystem to get it into the state it is currently in, and > > > di_changecount provides information no other metadata in the inode > > > contains. > > > > > > > I took a quick look in xfsprogs, but I > > > > didn't see anything there. Is there a library or something that these > > > > tools use to get at this value? > > > > > > xfs_db is the tool I use for this, such as: > > > > > > $ sudo xfs_db -c "sb 0" -c "a rootino" -c "p v3.change_count" /dev/mapper/fast > > > v3.change_count = 35 > > > $ > > > > > > The root inode in this filesystem has a change count of 35. The root > > > inode has 32 dirents in it, which means that no entries have ever > > > been removed or renamed. This sort of insight into the past history > > > of inode metadata is largely impossible to get any other way, and > > > it's been the difference between understanding failure and having no > > > clue more than once. > > > > > > Most block device parsing applications simply write their own > > > decoder that walks the on-disk format. That's pretty trivial to do, > > > developers can get all the information needed to do this from the > > > on-disk format specification documentation we keep on kernel.org... 
> > > > > > > Fair enough. I'm not here to tell you that you guys that you need to > > change how di_changecount works. If it's too valuable to keep it > > counting atime-only updates, then so be it. > > > > If that's the case however, and given that the multigrain timestamp work > > is effectively dead, then I don't see an alternative to growing the on- > > disk inode. Do you? > > Yes, I do see alternatives. That's what I've been trying > (unsuccessfully) to describe and get consensus on. I feel like I'm > being ignored and rail-roaded here, because nobody is even > acknowledging that I'm proposing alternatives and keeps insisting > that the only solution is a change of on-disk format. > > So, I'll summarise the situation *yet again* in the hope that this > time I won't get people arguing about atime vs i-version and what > constitutes an on-disk format change because that goes nowhere and > does nothing to determine which solution might be acceptible. > > The basic situation is this: > > If XFS can ignore relatime or lazytime persistent updates for given > situations, then *we don't need to make periodic on-disk updates of > atime*. This makes the whole problem of "persistent atime update bumps > i_version" go away because then we *aren't making persistent atime > updates* except when some other persistent modification that bumps > [cm]time occurs. > > But I don't want to do this unconditionally - for systems not > running anything that samples i_version we want relatime/lazytime > to behave as they are supposed to and do periodic persistent updates > as per normal. Principle of least surprise and all that jazz. > > So we really need an indication for inodes that we should enable this > mode for the inode. I have asked if we can have per-operation > context flag to trigger this given the needs for io_uring to have > context flags for timestamp updates to be added. > > I have asked if we can have an inode flag set by the VFS or > application code for this. e.g. 
a flag set by nfsd whenever it accesses a > given inode. > > I have asked if this inode flag can just be triggered if we ever see > I_VERSION_QUERIED set or statx is used to retrieve a change cookie, > and whether this is a reliable mechanism for setting such a flag. > Ok, so to make sure I understand what you're proposing: This would be a new inode flag that would be set in conjunction with I_VERSION_QUERIED (but presumably is never cleared)? When XFS sees this flag set, it would skip sending the atime to disk. Given that you want to avoid on-disk changes, I assume this flag will not be stored on disk. What happens after the NFS server reboots? Consider: 1/ NFS server queries for the i_version and we set the I_NO_ATIME_UPDATES_ON_DISK flag (or whatever) in conjunction with I_VERSION_QUERIED. Some atime updates occur and the i_version isn't bumped (as you'd expect). 2/ The server then reboots. 3/ Server comes back up, and some local task issues a read against the inode. I_NO_ATIME_UPDATES_ON_DISK never had a chance to be set after the reboot, so that atime update ends up incrementing the i_version counter. 4/ client cache invalidation occurs even though there was no write to the file This might reduce some of the spurious i_version bumps, but I don't see how it can eliminate them entirely. > I have suggested mechanisms for using masked off bits of timestamps > to encode sub-timestamp granularity change counts and keep them > invisible to userspace and then not using i_version at all for XFS. > This avoids all the problems that the multi-grain timestamp > infrastructure exposed due to variable granularity of user visible > timestamps and ordering across inodes with different granularity. > This is potentially a general solution, too. > I don't really understand this at all, but trying to do anything with fine-grained timestamps will just run into a lot of the same problems we hit with the multigrain work. 
If you still see this as a path forward, maybe you can describe it more detail? > So, yeah, there are *lots* of ways we can solve this problem without > needing to change on-disk formats. > -- Jeff Layton <jlayton@kernel.org>
On Tue 31-10-23 07:04:53, Jeff Layton wrote: > On Tue, 2023-10-31 at 09:37 +1100, Dave Chinner wrote: > > I have suggested mechanisms for using masked off bits of timestamps > > to encode sub-timestamp granularity change counts and keep them > > invisible to userspace and then not using i_version at all for XFS. > > This avoids all the problems that the multi-grain timestamp > > infrastructure exposed due to variable granularity of user visible > > timestamps and ordering across inodes with different granularity. > > This is potentially a general solution, too. > > I don't really understand this at all, but trying to do anything with > fine-grained timestamps will just run into a lot of the same problems we > hit with the multigrain work. If you still see this as a path forward, > maybe you can describe it more detail? Dave explained a bit more details here [1] like: Another options is for XFS to play it's own internal tricks with [cm]time granularity and turn off i_version. e.g. limit external timestamp visibility to 1us and use the remaining dozen bits of the ns field to hold a change counter for updates within a single coarse timer tick. This guarantees the timestamp changes within a coarse tick for the purposes of change detection, but we don't expose those bits to applications so applications that compare timestamps across inodes won't get things back to front like was happening with the multi-grain timestamps.... - So as far as I understand Dave wants to effectively persist counter in low bits of ctime and expose ctime+counter as its change cookie. I guess that could work and what makes the complexity manageable compared to full multigrain timestamps is the fact that we have one filesystem, one on-disk format etc. 
The only slight trouble could be that if we previously handed out something in low bits of ctime for XFS, we need to keep handing the same thing out until the inode changes (i.e., no rounding until the moment inode changes) as the old timestamp could be stored somewhere externally and compared. Honza [1] https://lore.kernel.org/all/ZTjMRRqmlJ+fTys2@dread.disaster.area/ -- Jan Kara <jack@suse.com> SUSE Labs, CR
On Tue, 2023-10-31 at 13:22 +0100, Jan Kara wrote: > On Tue 31-10-23 07:04:53, Jeff Layton wrote: > > On Tue, 2023-10-31 at 09:37 +1100, Dave Chinner wrote: > > > I have suggested mechanisms for using masked off bits of timestamps > > > to encode sub-timestamp granularity change counts and keep them > > > invisible to userspace and then not using i_version at all for XFS. > > > This avoids all the problems that the multi-grain timestamp > > > infrastructure exposed due to variable granularity of user visible > > > timestamps and ordering across inodes with different granularity. > > > This is potentially a general solution, too. > > > > I don't really understand this at all, but trying to do anything with > > fine-grained timestamps will just run into a lot of the same problems we > > hit with the multigrain work. If you still see this as a path forward, > > maybe you can describe it more detail? > > Dave explained a bit more details here [1] like: > > Another options is for XFS to play it's own internal tricks with > [cm]time granularity and turn off i_version. e.g. limit external > timestamp visibility to 1us and use the remaining dozen bits of the > ns field to hold a change counter for updates within a single coarse > timer tick. This guarantees the timestamp changes within a coarse > tick for the purposes of change detection, but we don't expose those > bits to applications so applications that compare timestamps across > inodes won't get things back to front like was happening with the > multi-grain timestamps.... > - > > So as far as I understand Dave wants to effectively persist counter in low > bits of ctime and expose ctime+counter as its change cookie. I guess that > could work and what makes the complexity manageable compared to full > multigrain timestamps is the fact that we have one filesystem, one on-disk > format etc. 
The only slight trouble could be that if we previously handed > out something in low bits of ctime for XFS, we need to keep handing the > same thing out until the inode changes (i.e., no rounding until the moment > inode changes) as the old timestamp could be stored somewhere externally > and compared. > > Honza > > [1] https://lore.kernel.org/all/ZTjMRRqmlJ+fTys2@dread.disaster.area/ > > Got it. That makes sense and could probably be made to work. Doing that all in XFS won't be simple though. You'll need to reimplement stuff like file_modified() and file_update_time(). Those get called from deep within the VFS and from page fault handlers. FWIW, that's the main reason the multigrain work was so invasive, even though it was a filesystem-specific feature. -- Jeff Layton <jlayton@kernel.org>
On Mon, 30 Oct 2023 at 12:37, Dave Chinner <david@fromorbit.com> wrote:
>
> If XFS can ignore relatime or lazytime persistent updates for given
> situations, then *we don't need to make periodic on-disk updates of
> atime*. This makes the whole problem of "persistent atime update bumps
> i_version" go away because then we *aren't making persistent atime
> updates* except when some other persistent modification that bumps
> [cm]time occurs.
Well, I think this should be split into two independent questions:
(a) are relatime or lazytime atime updates persistent if nothing else changes?
(b) do atime updates _ever_ update i_version *regardless* of relatime
or lazytime?
and honestly, I think the best answer to (b) would be that "no,
i_version should simply not change for atime updates". And I think
that answer is what it is because no user of i_version seems to want
it.
Now, the reason it's a single question for you is that apparently for
XFS, the only thing that matters is "inode was written to disk" and
that "di_changecount" value is thus related to the persistence of
atime updates, but splitting di_changecount out to be a separate thing
from i_version seems to be on the table, so I think those two things
really could be independent issues.
> But I don't want to do this unconditionally - for systems not
> running anything that samples i_version we want relatime/lazytime
> to behave as they are supposed to and do periodic persistent updates
> as per normal. Principle of least surprise and all that jazz.
Well - see above: I think in a perfect world, we'd simply never change
i_version at all for any atime updates, and relatime/lazytime simply
wouldn't be an issue at all wrt i_version.
Wouldn't _that_ be the truly "least surprising" behavior? Considering
that nobody wants i_version to change for what are otherwise pure
reads (that's kind of the *definition* of atime, after all).
Now, the annoyance here is that *both* (a) and (b) then have that
impact of "i_version no longer tracks di_changecount".
So I don't think the issue here is "i_version" per se. I think in a
vacuum, the best option of i_version is pretty obvious. But if you
want i_version to track di_changecount, *then* you end up with that
situation where the persistence of atime matters, and i_version needs
to update whenever a (persistent) atime update happens.
> So we really need an indication for inodes that we should enable this
> mode for the inode. I have asked if we can have per-operation
> context flag to trigger this given the needs for io_uring to have
> context flags for timestamp updates to be added.
I really think some kind of new and even *more* complex and
non-intuitive behavior is the worst of both worlds. Having atime
updates be conditionally persistent - on top of already being delayed
by lazytime/relatime - and having the persistence magically change
depending on whether something wants to get i_version updates - sounds
just completely crazy.
Particularly as *none* of the people who want i_version updates
actually want them for atime at all.
So I really think this all boils down to "is xfs really willing to
split di_changecount from i_version"?
> I have asked if we can have an inode flag set by the VFS or
> application code for this. e.g. a flag set by nfsd whenever it accesses a
> given inode.
>
> I have asked if this inode flag can just be triggered if we ever see
> I_VERSION_QUERIED set or statx is used to retrieve a change cookie,
> and whether this is a reliable mechanism for setting such a flag.
See above: linking this to I_VERSION_QUERIED is horrific. The people
who set that bit do *NOT* want atime updates to change i_version under
any circumstances. It was always a mistake.
This really is all *entirely* an artifact of that "di_changecount" vs
"i_version" being tied together. You did seem to imply that you'd be
ok with having "di_changecount" be split from i_version, ie from an
earlier email in this thread:
"Now that NFS is using a proper abstraction (i.e. vfs_statx()) to get
the change cookie, we really don't need to expose di_changecount in
i_version at all - we could simply copy an internal di_changecount
value into the statx cookie field in xfs_vn_getattr() and there
would be almost no change of behaviour from the perspective of NFS
and IMA at all"
but while I suspect *that* part is easy and straightforward, the
problem then becomes one of "what about the persistence of i_version",
and then you'd need a new field for *that* anyway, and would want a
new on-disk format regardless.
Linus
On Mon, Oct 30, 2023 at 01:11:56PM -1000, Linus Torvalds wrote: > On Mon, 30 Oct 2023 at 12:37, Dave Chinner <david@fromorbit.com> wrote: > > > > If XFS can ignore relatime or lazytime persistent updates for given > > situations, then *we don't need to make periodic on-disk updates of > > atime*. This makes the whole problem of "persistent atime update bumps > > i_version" go away because then we *aren't making persistent atime > > updates* except when some other persistent modification that bumps > > [cm]time occurs. > > Well, I think this should be split into two independent questions: > > (a) are relatime or lazytime atime updates persistent if nothing else changes? They only become persistent after 24 hours or, in the case of relatime, immediately persistent if mtime < atime (i.e. read after a modification). Those are the only times that the VFS triggers persistent writeback of atime, and it's the latter case (mtime < atime) that is the specific trigger that exposed the problem with atime bumping i_version in the first place. > (b) do atime updates _ever_ update i_version *regardless* of relatime > or lazytime? > > and honestly, I think the best answer to (b) would be that "no, > i_version should simply not change for atime updates". And I think > that answer is what it is because no user of i_version seems to want > it. As I keep repeating: Repeatedly stating that "atime should not bump i_version" does not address the questions I'm asking *at all*. > Now, the reason it's a single question for you is that apparently for > XFS, the only thing that matters is "inode was written to disk" and > that "di_changecount" value is thus related to the persistence of > atime updates, but splitting di_changecount out to be a separate thing > from i_version seems to be on the table, so I think those two things > really could be independent issues. Wrong way around - we'd have to split i_version out from di_changecount. 
It's i_version that has changed semantics, not di_changecount, and di_changecount behaviour must remain unchanged. What I really don't want to do is implement a new i_version field in the XFS on-disk format. What this redefinition of i_version semantics has made clear is that i_version is *user defined metadata*, not internal filesystem metadata that is defined by the filesystem on-disk format. User defined persistent metadata *belongs in xattrs*, not in the core filesystem on-disk formats. If the VFS wants to define and manage i_version behaviour, smeantics and persistence independently of the filesystems that manage the persistent storage (as it clearly does!) then we should treat it just like any other VFS defined inode metadata (e.g. per inode objects like security constraints, ACLs, fsverity digests, fscrypt keys, etc). i.e. it should be in a named xattr, not directly implemented in the filesystem on-disk format deinfitions. Then the application can change the meaning of the metadata whenever and however it likes. Then filesystem developers just don't need to care about it at all because the VFS specific persistent metadata is not part of the on-disk format we need to maintain cross-platform forwards and backwards compatibility for. > > But I don't want to do this unconditionally - for systems not > > running anything that samples i_version we want relatime/lazytime > > to behave as they are supposed to and do periodic persistent updates > > as per normal. Principle of least surprise and all that jazz. > > Well - see above: I think in a perfect world, we'd simply never change > i_version at all for any atime updates, and relatime/lazytime simply > wouldn't be an issue at all wrt i_version. Right, that's what I'd like, especially as the new definition of i_version - "only change when [cm]time changes" - means that the VFS i_version is really now just a glorified timestamp. > Wouldn't _that_ be the trule "least surprising" behavior? 
Considering > that nobody wants i_version to change for what are otherwise pure > reads (that's kind of the *definition* of atime, after all). So, if you don't like the idea of us ignoring relatime/lazytime conditionally, are we allowed to simply ignore them *all the time* and do all timestamp updates in the manner that causes users the least amount of pain? I mean, relatime only exists because atime updates cause users pain. lazytime only exists because relatime doesn't address the pain that timestamp updates cause mixed read/write or pure O_DSYNC overwrite workloads pain. noatime is a pain because it loses all atime updates. There is no "one size is right for everyone", so why not just let filesystems do what is most efficient from an internal IO and persistence POV whilst still maintaining the majority of "expected" behaviours? Keep in mind, though, that this is all moot if we can get rid of i_version entirely.... > Now, the annoyance here is that *both* (a) and (b) then have that > impact of "i_version no longer tracks di_changecount". .... and what is annoying is that that the new i_version just a glorified ctime change counter. What we should be fixing is ctime - integrating this change counting into ctime would allow us to make i_version go away entirely. i.e. We don't need a persistent ctime change counter if the ctime has sufficient resolution or persistent encoding that it does not need an external persistent change counter. That was reasoning behind the multi-grain timestamps. While the mgts implementation was flawed, the reasoning behind it certainly isn't. We should be trying to get rid of i_version by integrating it into ctime updates, not arguing how atime vs i_version should work. > So I don't think the issue here is "i_version" per se. I think in a > vacuum, the best option of i_version is pretty obvious. 
But if you > want i_version to track di_changecount, *then* you end up with that > situation where the persistence of atime matters, and i_version needs > to update whenever a (persistent) atime update happens. Yet I don't want i_version to track di_changecount. I want to *stop supporting i_version altogether* in XFS. I want i_version as filesystem internal metadata to die completely. I don't want to change the on disk format to add a new i_version field because we'll be straight back in this same siutation when the next i_version bug is found and semantics get changed yet again. Hence if we can encode the necessary change attributes into ctime, we can drop VFS i_version support altogether. Then the "atime bumps i_version" problem also goes away because then we *don't use i_version*. But if we can't get the VFS to do this with ctime, at least we have the abstractions available to us (i.e. timestamp granularity and statx change cookie) to allow XFS to implement this sort of ctime-with-integrated-change-counter internally to the filesystem and be able to drop i_version support.... [....] > This really is all *entirely* an artifact of that "bi_changecount" vs > "i_version" being tied together. You did seem to imply that you'd be > ok with having "bi_changecount" be split from i_version, ie from an > earlier email in this thread: > > "Now that NFS is using a proper abstraction (i.e. vfs_statx()) to get > the change cookie, we really don't need to expose di_changecount in > i_version at all - we could simply copy an internal di_changecount > value into the statx cookie field in xfs_vn_getattr() and there > would be almost no change of behaviour from the perspective of NFS > and IMA at all" .... which is what I was talking about here. i.e. I was not talking about splitting i_version from di_changecount - I was talking about being able to stop supporting the VFS i_version counter entirely and still having NFS and IMA work correctly. 
Continually bring the argument back to "atime vs i_version" misses the bigger issues around this new i_version definition and implementation, and that the real solution should be to fix ctime updates to make i_version at the VFS level go away forever. -Dave. -- Dave Chinner david@fromorbit.com
On Tue, 2023-10-31 at 12:42 +1100, Dave Chinner wrote: > On Mon, Oct 30, 2023 at 01:11:56PM -1000, Linus Torvalds wrote: > > On Mon, 30 Oct 2023 at 12:37, Dave Chinner <david@fromorbit.com> wrote: > > > > > > If XFS can ignore relatime or lazytime persistent updates for given > > > situations, then *we don't need to make periodic on-disk updates of > > > atime*. This makes the whole problem of "persistent atime update bumps > > > i_version" go away because then we *aren't making persistent atime > > > updates* except when some other persistent modification that bumps > > > [cm]time occurs. > > > > Well, I think this should be split into two independent questions: > > > > (a) are relatime or lazytime atime updates persistent if nothing else changes? > > They only become persistent after 24 hours or, in the case of > relatime, immediately persistent if mtime < atime (i.e. read after a > modification). Those are the only times that the VFS triggers > persistent writeback of atime, and it's the latter case (mtime < > atime) that is the specific trigger that exposed the problem with > atime bumping i_version in the first place. > > > (b) do atime updates _ever_ update i_version *regardless* of relatime > > or lazytime? > > > > and honestly, I think the best answer to (b) would be that "no, > > i_version should simply not change for atime updates". And I think > > that answer is what it is because no user of i_version seems to want > > it. > > As I keep repeating: Repeatedly stating that "atime should not bump > i_version" does not address the questions I'm asking *at all*. 
> > > Now, the reason it's a single question for you is that apparently for > > XFS, the only thing that matters is "inode was written to disk" and > > that "di_changecount" value is thus related to the persistence of > > atime updates, but splitting di_changecount out to be a separate thing > > from i_version seems to be on the table, so I think those two things > > really could be independent issues. > > Wrong way around - we'd have to split i_version out from > di_changecount. It's i_version that has changed semantics, not > di_changecount, and di_changecount behaviour must remain unchanged. > I have to take issue with your characterization of this. The requirements for NFS's change counter have not changed. Clearly there was a breakdown in communications when it was first implemented in Linux that caused atime updates to get counted in the i_version value, but that was never intentional and never by design. I'm simply trying to correct this historical mistake. > What I really don't want to do is implement a new i_version field in > the XFS on-disk format. What this redefinition of i_version > semantics has made clear is that i_version is *user defined > metadata*, not internal filesystem metadata that is defined by the > filesystem on-disk format. > > User defined persistent metadata *belongs in xattrs*, not in the > core filesystem on-disk formats. If the VFS wants to define and > manage i_version behaviour, smeantics and persistence independently > of the filesystems that manage the persistent storage (as it clearly > does!) then we should treat it just like any other VFS defined inode > metadata (e.g. per inode objects like security constraints, ACLs, > fsverity digests, fscrypt keys, etc). i.e. it should be in a named > xattr, not directly implemented in the filesystem on-disk format > deinfitions. > > Then the application can change the meaning of the metadata whenever > and however it likes. 
Then filesystem developers just don't need > to care about it at all because the VFS specific persistent metadata > is not part of the on-disk format we need to maintain > cross-platform forwards and backwards compatibility for. > > > > But I don't want to do this unconditionally - for systems not > > > running anything that samples i_version we want relatime/lazytime > > > to behave as they are supposed to and do periodic persistent updates > > > as per normal. Principle of least surprise and all that jazz. > > > > Well - see above: I think in a perfect world, we'd simply never change > > i_version at all for any atime updates, and relatime/lazytime simply > > wouldn't be an issue at all wrt i_version. > > Right, that's what I'd like, especially as the new definition of > i_version - "only change when [cm]time changes" - means that the VFS > i_version is really now just a glorified timestamp. > > > Wouldn't _that_ be the trule "least surprising" behavior? Considering > > that nobody wants i_version to change for what are otherwise pure > > reads (that's kind of the *definition* of atime, after all). > > So, if you don't like the idea of us ignoring relatime/lazytime > conditionally, are we allowed to simply ignore them *all the time* > and do all timestamp updates in the manner that causes users the > least amount of pain? > > I mean, relatime only exists because atime updates cause users pain. > lazytime only exists because relatime doesn't address the pain that > timestamp updates cause mixed read/write or pure O_DSYNC overwrite > workloads pain. noatime is a pain because it loses all atime > updates. > > There is no "one size is right for everyone", so why not just let > filesystems do what is most efficient from an internal IO and > persistence POV whilst still maintaining the majority of "expected" > behaviours? > > Keep in mind, though, that this is all moot if we can get rid of > i_version entirely.... 
> > > Now, the annoyance here is that *both* (a) and (b) then have that > > impact of "i_version no longer tracks di_changecount". > > .... and what is annoying is that that the new i_version just a > glorified ctime change counter. What we should be fixing is ctime - > integrating this change counting into ctime would allow us to make > i_version go away entirely. i.e. We don't need a persistent ctime > change counter if the ctime has sufficient resolution or persistent > encoding that it does not need an external persistent change > counter. > > That was reasoning behind the multi-grain timestamps. While the mgts > implementation was flawed, the reasoning behind it certainly isn't. > We should be trying to get rid of i_version by integrating it into > ctime updates, not arguing how atime vs i_version should work. > > > So I don't think the issue here is "i_version" per se. I think in a > > vacuum, the best option of i_version is pretty obvious. But if you > > want i_version to track di_changecount, *then* you end up with that > > situation where the persistence of atime matters, and i_version needs > > to update whenever a (persistent) atime update happens. > > Yet I don't want i_version to track di_changecount. > > I want to *stop supporting i_version altogether* in XFS. > > I want i_version as filesystem internal metadata to die completely. > > I don't want to change the on disk format to add a new i_version > field because we'll be straight back in this same siutation when the > next i_version bug is found and semantics get changed yet again. > > Hence if we can encode the necessary change attributes into ctime, > we can drop VFS i_version support altogether. Then the "atime bumps > i_version" problem also goes away because then we *don't use > i_version*. > > But if we can't get the VFS to do this with ctime, at least we have > the abstractions available to us (i.e. 
timestamp granularity and > statx change cookie) to allow XFS to implement this sort of > ctime-with-integrated-change-counter internally to the filesystem > and be able to drop i_version support.... > > [....] > > This really is all *entirely* an artifact of that "bi_changecount" vs > > "i_version" being tied together. You did seem to imply that you'd be > > ok with having "bi_changecount" be split from i_version, ie from an > > earlier email in this thread: > > > > "Now that NFS is using a proper abstraction (i.e. vfs_statx()) to get > > the change cookie, we really don't need to expose di_changecount in > > i_version at all - we could simply copy an internal di_changecount > > value into the statx cookie field in xfs_vn_getattr() and there > > would be almost no change of behaviour from the perspective of NFS > > and IMA at all" > > .... which is what I was talking about here. > > i.e. I was not talking about splitting i_version from di_changecount > - I was talking about being able to stop supporting the VFS > i_version counter entirely and still having NFS and IMA work > correctly. > > Continually bring the argument back to "atime vs i_version" misses > the bigger issues around this new i_version definition and > implementation, and that the real solution should be to fix ctime > updates to make i_version at the VFS level go away forever. > Ok, so your proposal is to keep using coarse grained timestamps, but use the "insignificant" bits in them to store a change counter? That sounds complex and fraught with peril, but I'm willing to listen to some specifics about how that would work. -- Jeff Layton <jlayton@kernel.org>
>>>>> "Jeff" == Jeff Layton <jlayton@kernel.org> writes: > On Tue, 2023-10-31 at 12:42 +1100, Dave Chinner wrote: >> On Mon, Oct 30, 2023 at 01:11:56PM -1000, Linus Torvalds wrote: >> > On Mon, 30 Oct 2023 at 12:37, Dave Chinner <david@fromorbit.com> wrote: >> > > >> > > If XFS can ignore relatime or lazytime persistent updates for given >> > > situations, then *we don't need to make periodic on-disk updates of >> > > atime*. This makes the whole problem of "persistent atime update bumps >> > > i_version" go away because then we *aren't making persistent atime >> > > updates* except when some other persistent modification that bumps >> > > [cm]time occurs. >> > >> > Well, I think this should be split into two independent questions: >> > >> > (a) are relatime or lazytime atime updates persistent if nothing else changes? >> >> They only become persistent after 24 hours or, in the case of >> relatime, immediately persistent if mtime < atime (i.e. read after a >> modification). Those are the only times that the VFS triggers >> persistent writeback of atime, and it's the latter case (mtime < >> atime) that is the specific trigger that exposed the problem with >> atime bumping i_version in the first place. >> >> > (b) do atime updates _ever_ update i_version *regardless* of relatime >> > or lazytime? >> > >> > and honestly, I think the best answer to (b) would be that "no, >> > i_version should simply not change for atime updates". And I think >> > that answer is what it is because no user of i_version seems to want >> > it. >> >> As I keep repeating: Repeatedly stating that "atime should not bump >> i_version" does not address the questions I'm asking *at all*. 
>> >> > Now, the reason it's a single question for you is that apparently for >> > XFS, the only thing that matters is "inode was written to disk" and >> > that "di_changecount" value is thus related to the persistence of >> > atime updates, but splitting di_changecount out to be a separate thing >> > from i_version seems to be on the table, so I think those two things >> > really could be independent issues. >> >> Wrong way around - we'd have to split i_version out from >> di_changecount. It's i_version that has changed semantics, not >> di_changecount, and di_changecount behaviour must remain unchanged. >> > I have to take issue with your characterization of this. The > requirements for NFS's change counter have not changed. Clearly there > was a breakdown in communications when it was first implemented in Linux > that caused atime updates to get counted in the i_version value, but > that was never intentional and never by design. This has been bugging me, but all the references to NFS really mean NFSv4.1 or newer, correct? I can't see how any of this affects NFSv3 at all, and that's probably the still dominant form of NFS, right? John
On Tue, Oct 31, 2023 at 3:42 AM Dave Chinner <david@fromorbit.com> wrote: > [...] > .... and what is annoying is that that the new i_version just a > glorified ctime change counter. What we should be fixing is ctime - > integrating this change counting into ctime would allow us to make > i_version go away entirely. i.e. We don't need a persistent ctime > change counter if the ctime has sufficient resolution or persistent > encoding that it does not need an external persistent change > counter. > > That was reasoning behind the multi-grain timestamps. While the mgts > implementation was flawed, the reasoning behind it certainly isn't. > We should be trying to get rid of i_version by integrating it into > ctime updates, not arguing how atime vs i_version should work. > > > So I don't think the issue here is "i_version" per se. I think in a > > vacuum, the best option of i_version is pretty obvious. But if you > > want i_version to track di_changecount, *then* you end up with that > > situation where the persistence of atime matters, and i_version needs > > to update whenever a (persistent) atime update happens. > > Yet I don't want i_version to track di_changecount. > > I want to *stop supporting i_version altogether* in XFS. > > I want i_version as filesystem internal metadata to die completely. > > I don't want to change the on disk format to add a new i_version > field because we'll be straight back in this same siutation when the > next i_version bug is found and semantics get changed yet again. > > Hence if we can encode the necessary change attributes into ctime, > we can drop VFS i_version support altogether. Then the "atime bumps > i_version" problem also goes away because then we *don't use > i_version*. > > But if we can't get the VFS to do this with ctime, at least we have > the abstractions available to us (i.e. 
timestamp granularity and > statx change cookie) to allow XFS to implement this sort of > ctime-with-integrated-change-counter internally to the filesystem > and be able to drop i_version support.... > I don't know if it was mentioned before in one of the many threads, but there is another benefit of ctime-with-integrated-change-counter approach - it is the ability to extend the solution with some adaptations also to mtime. The "change cookie" is used to know if inode metadata cache should be invalidated and mtime is often used to know if data cache should be invalidated, or if data comparison could be skipped (e.g. rsync). The difference is that mtime can be set by user, so using lower nsec bits for modification counter would require to truncate the user set time granularity to 100ns - that is probably acceptable, but only as an opt-in behavior. The special value 0 for mtime-change-counter could be reserved for mtime that was set by the user or for upgrade of existing inode, where 0 counter means that mtime cannot be trusted as an accurate data modification-cookie. This feature is going to be useful for the vfs HSM implementation [1] that I am working on and it actually rhymes with the XFS DMAPI patches that were never fully merged upstream. Speaking on behalf of my employer, we would love to see the data modification-cookie feature implemented, whether in vfs or in xfs. *IF* the result on this thread is that the chosen solution is ctime-with-change-counter in XFS *AND* if there is agreement among XFS developers to extend it with an opt-in mkfs/mount option to 100ns-mtime-with-change-counter in XFS *THEN* I think I will be able to allocate resources to drive this xfs work. Thanks, Amir. [1] https://github.com/amir73il/fsnotify-utils/wiki/Hierarchical-Storage-Management-API
On Tue, Oct 31, 2023 at 09:03:57AM +0200, Amir Goldstein wrote: > On Tue, Oct 31, 2023 at 3:42 AM Dave Chinner <david@fromorbit.com> wrote: > > > [...] > > .... and what is annoying is that that the new i_version just a > > glorified ctime change counter. What we should be fixing is ctime - > > integrating this change counting into ctime would allow us to make > > i_version go away entirely. i.e. We don't need a persistent ctime > > change counter if the ctime has sufficient resolution or persistent > > encoding that it does not need an external persistent change > > counter. > > > > That was reasoning behind the multi-grain timestamps. While the mgts > > implementation was flawed, the reasoning behind it certainly isn't. > > We should be trying to get rid of i_version by integrating it into > > ctime updates, not arguing how atime vs i_version should work. > > > > > So I don't think the issue here is "i_version" per se. I think in a > > > vacuum, the best option of i_version is pretty obvious. But if you > > > want i_version to track di_changecount, *then* you end up with that > > > situation where the persistence of atime matters, and i_version needs > > > to update whenever a (persistent) atime update happens. > > > > Yet I don't want i_version to track di_changecount. > > > > I want to *stop supporting i_version altogether* in XFS. > > > > I want i_version as filesystem internal metadata to die completely. > > > > I don't want to change the on disk format to add a new i_version > > field because we'll be straight back in this same siutation when the > > next i_version bug is found and semantics get changed yet again. > > > > Hence if we can encode the necessary change attributes into ctime, > > we can drop VFS i_version support altogether. Then the "atime bumps > > i_version" problem also goes away because then we *don't use > > i_version*. > > > > But if we can't get the VFS to do this with ctime, at least we have > > the abstractions available to us (i.e. 
timestamp granularity and > > statx change cookie) to allow XFS to implement this sort of > > ctime-with-integrated-change-counter internally to the filesystem > > and be able to drop i_version support.... > > > > I don't know if it was mentioned before in one of the many threads, > but there is another benefit of ctime-with-integrated-change-counter > approach - it is the ability to extend the solution with some adaptations > also to mtime. > > The "change cookie" is used to know if inode metadata cache should > be invalidated and mtime is often used to know if data cache should > be invalidated, or if data comparison could be skipped (e.g. rsync). > > The difference is that mtime can be set by user, so using lower nsec > bits for modification counter would require to truncate the user set > time granularity to 100ns - that is probably acceptable, but only as > an opt-in behavior. > > The special value 0 for mtime-change-counter could be reserved for > mtime that was set by the user or for upgrade of existing inode, > where 0 counter means that mtime cannot be trusted as an accurate > data modification-cookie. What about write faults on an mmap region? The first ro->rw transition results in an mtime update, but not again until the page gets cleaned. > This feature is going to be useful for the vfs HSM implementation [1] > that I am working on and it actually rhymes with the XFS DMAPI > patches that were never fully merged upstream. Kudos, I cannot figure out a non-pejorative word that rhymes with "**API". ;) --D > Speaking on behalf of my employer, we would love to see the data > modification-cookie feature implemented, whether in vfs or in xfs. > > *IF* the result on this thread is that the chosen solution is > ctime-with-change-counter in XFS > *AND* if there is agreement among XFS developers to extend it with > an opt-in mkfs/mount option to 100ns-mtime-with-change-counter in XFS > *THEN* I think I will be able to allocate resources to drive this xfs work. 
> > Thanks, > Amir. > > [1] https://github.com/amir73il/fsnotify-utils/wiki/Hierarchical-Storage-Management-API >
On Wed, Nov 1, 2023 at 1:12 AM Darrick J. Wong <djwong@kernel.org> wrote: > > On Tue, Oct 31, 2023 at 09:03:57AM +0200, Amir Goldstein wrote: > > On Tue, Oct 31, 2023 at 3:42 AM Dave Chinner <david@fromorbit.com> wrote: > > > > > [...] > > > .... and what is annoying is that that the new i_version just a > > > glorified ctime change counter. What we should be fixing is ctime - > > > integrating this change counting into ctime would allow us to make > > > i_version go away entirely. i.e. We don't need a persistent ctime > > > change counter if the ctime has sufficient resolution or persistent > > > encoding that it does not need an external persistent change > > > counter. > > > > > > That was reasoning behind the multi-grain timestamps. While the mgts > > > implementation was flawed, the reasoning behind it certainly isn't. > > > We should be trying to get rid of i_version by integrating it into > > > ctime updates, not arguing how atime vs i_version should work. > > > > > > > So I don't think the issue here is "i_version" per se. I think in a > > > > vacuum, the best option of i_version is pretty obvious. But if you > > > > want i_version to track di_changecount, *then* you end up with that > > > > situation where the persistence of atime matters, and i_version needs > > > > to update whenever a (persistent) atime update happens. > > > > > > Yet I don't want i_version to track di_changecount. > > > > > > I want to *stop supporting i_version altogether* in XFS. > > > > > > I want i_version as filesystem internal metadata to die completely. > > > > > > I don't want to change the on disk format to add a new i_version > > > field because we'll be straight back in this same siutation when the > > > next i_version bug is found and semantics get changed yet again. > > > > > > Hence if we can encode the necessary change attributes into ctime, > > > we can drop VFS i_version support altogether. 
Then the "atime bumps > > > i_version" problem also goes away because then we *don't use > > > i_version*. > > > > > > But if we can't get the VFS to do this with ctime, at least we have > > > the abstractions available to us (i.e. timestamp granularity and > > > statx change cookie) to allow XFS to implement this sort of > > > ctime-with-integrated-change-counter internally to the filesystem > > > and be able to drop i_version support.... > > > > > > > I don't know if it was mentioned before in one of the many threads, > > but there is another benefit of ctime-with-integrated-change-counter > > approach - it is the ability to extend the solution with some adaptations > > also to mtime. > > > > The "change cookie" is used to know if inode metadata cache should > > be invalidated and mtime is often used to know if data cache should > > be invalidated, or if data comparison could be skipped (e.g. rsync). > > > > The difference is that mtime can be set by user, so using lower nsec > > bits for modification counter would require to truncate the user set > > time granularity to 100ns - that is probably acceptable, but only as > > an opt-in behavior. > > > > The special value 0 for mtime-change-counter could be reserved for > > mtime that was set by the user or for upgrade of existing inode, > > where 0 counter means that mtime cannot be trusted as an accurate > > data modification-cookie. > > What about write faults on an mmap region? The first ro->rw transition > results in an mtime update, but not again until the page gets cleaned. > That problem is out of scope. As is the issue of whether mtime is updated before or after the data modification. For all practical matter, HSM (or any backup/sync software) could fsync data of file before testing its mtime. I am working on an additional mechanism (sb_start_write_srcu) which HSM can use to close the rest of the possible TOCTOU races. 
The problem that modification-cookie needs to solve is the coarse grained mtime timestamp, very much like change-cookie for ctime and additionally it needs to solve the problem that HSM needs to know if the mtime timestamp was generated by the filesystem or set by the user. Thanks, Amir.
On Tue, 2023-10-31 at 09:03 +0200, Amir Goldstein wrote: > On Tue, Oct 31, 2023 at 3:42 AM Dave Chinner <david@fromorbit.com> wrote: > > > [...] > > .... and what is annoying is that that the new i_version just a > > glorified ctime change counter. What we should be fixing is ctime - > > integrating this change counting into ctime would allow us to make > > i_version go away entirely. i.e. We don't need a persistent ctime > > change counter if the ctime has sufficient resolution or persistent > > encoding that it does not need an external persistent change > > counter. > > > > That was reasoning behind the multi-grain timestamps. While the mgts > > implementation was flawed, the reasoning behind it certainly isn't. > > We should be trying to get rid of i_version by integrating it into > > ctime updates, not arguing how atime vs i_version should work. > > > > > So I don't think the issue here is "i_version" per se. I think in a > > > vacuum, the best option of i_version is pretty obvious. But if you > > > want i_version to track di_changecount, *then* you end up with that > > > situation where the persistence of atime matters, and i_version needs > > > to update whenever a (persistent) atime update happens. > > > > Yet I don't want i_version to track di_changecount. > > > > I want to *stop supporting i_version altogether* in XFS. > > > > I want i_version as filesystem internal metadata to die completely. > > > > I don't want to change the on disk format to add a new i_version > > field because we'll be straight back in this same siutation when the > > next i_version bug is found and semantics get changed yet again. > > > > Hence if we can encode the necessary change attributes into ctime, > > we can drop VFS i_version support altogether. Then the "atime bumps > > i_version" problem also goes away because then we *don't use > > i_version*. > > > > But if we can't get the VFS to do this with ctime, at least we have > > the abstractions available to us (i.e. 
timestamp granularity and > > statx change cookie) to allow XFS to implement this sort of > > ctime-with-integrated-change-counter internally to the filesystem > > and be able to drop i_version support.... > > > > I don't know if it was mentioned before in one of the many threads, > but there is another benefit of ctime-with-integrated-change-counter > approach - it is the ability to extend the solution with some adaptations > also to mtime. > > The "change cookie" is used to know if inode metadata cache should > be invalidated and mtime is often used to know if data cache should > be invalidated, or if data comparison could be skipped (e.g. rsync). > > The difference is that mtime can be set by user, so using lower nsec > bits for modification counter would require to truncate the user set > time granularity to 100ns - that is probably acceptable, but only as > an opt-in behavior. > > The special value 0 for mtime-change-counter could be reserved for > mtime that was set by the user or for upgrade of existing inode, > where 0 counter means that mtime cannot be trusted as an accurate > data modification-cookie. > > This feature is going to be useful for the vfs HSM implementation [1] > that I am working on and it actually rhymes with the XFS DMAPI > patches that were never fully merged upstream. > > Speaking on behalf of my employer, we would love to see the data > modification-cookie feature implemented, whether in vfs or in xfs. > > *IF* the result on this thread is that the chosen solution is > ctime-with-change-counter in XFS > *AND* if there is agreement among XFS developers to extend it with > an opt-in mkfs/mount option to 100ns-mtime-with-change-counter in XFS > *THEN* I think I will be able to allocate resources to drive this xfs work. > > Thanks, > Amir. > > [1] https://github.com/amir73il/fsnotify-utils/wiki/Hierarchical-Storage-Management-API I agree with Christian that if you can do this inside of XFS altogether, then that's a preferable solution. 
I don't quite understand how ctime-with-change-counter is intended to work however, so I'll have to reserve judgement there. -- Jeff Layton <jlayton@kernel.org>
On Tue, Oct 31, 2023 at 09:03:57AM +0200, Amir Goldstein wrote: > On Tue, Oct 31, 2023 at 3:42 AM Dave Chinner <david@fromorbit.com> wrote: > > > [...] > > .... and what is annoying is that that the new i_version just a > > glorified ctime change counter. What we should be fixing is ctime - > > integrating this change counting into ctime would allow us to make > > i_version go away entirely. i.e. We don't need a persistent ctime > > change counter if the ctime has sufficient resolution or persistent > > encoding that it does not need an external persistent change > > counter. > > > > That was reasoning behind the multi-grain timestamps. While the mgts > > implementation was flawed, the reasoning behind it certainly isn't. > > We should be trying to get rid of i_version by integrating it into > > ctime updates, not arguing how atime vs i_version should work. > > > > > So I don't think the issue here is "i_version" per se. I think in a > > > vacuum, the best option of i_version is pretty obvious. But if you > > > want i_version to track di_changecount, *then* you end up with that > > > situation where the persistence of atime matters, and i_version needs > > > to update whenever a (persistent) atime update happens. > > > > Yet I don't want i_version to track di_changecount. > > > > I want to *stop supporting i_version altogether* in XFS. > > > > I want i_version as filesystem internal metadata to die completely. > > > > I don't want to change the on disk format to add a new i_version > > field because we'll be straight back in this same siutation when the > > next i_version bug is found and semantics get changed yet again. > > > > Hence if we can encode the necessary change attributes into ctime, > > we can drop VFS i_version support altogether. Then the "atime bumps > > i_version" problem also goes away because then we *don't use > > i_version*. > > > > But if we can't get the VFS to do this with ctime, at least we have > > the abstractions available to us (i.e. 
timestamp granularity and > > statx change cookie) to allow XFS to implement this sort of > > ctime-with-integrated-change-counter internally to the filesystem > > and be able to drop i_version support.... > > > > I don't know if it was mentioned before in one of the many threads, > but there is another benefit of ctime-with-integrated-change-counter > approach - it is the ability to extend the solution with some adaptations > also to mtime. > > The "change cookie" is used to know if inode metadata cache should > be invalidated and mtime is often used to know if data cache should > be invalidated, or if data comparison could be skipped (e.g. rsync). > > The difference is that mtime can be set by user, so using lower nsec > bits for modification counter would require to truncate the user set > time granularity to 100ns - that is probably acceptable, but only as > an opt-in behavior. > > The special value 0 for mtime-change-counter could be reserved for > mtime that was set by the user or for upgrade of existing inode, > where 0 counter means that mtime cannot be trusted as an accurate > data modification-cookie. > > This feature is going to be useful for the vfs HSM implementation [1] > that I am working on and it actually rhymes with the XFS DMAPI > patches that were never fully merged upstream. > > Speaking on behalf of my employer, we would love to see the data > modification-cookie feature implemented, whether in vfs or in xfs. > > *IF* the result on this thread is that the chosen solution is > ctime-with-change-counter in XFS > *AND* if there is agreement among XFS developers to extend it with > an opt-in mkfs/mount option to 100ns-mtime-with-change-counter in XFS > *THEN* I think I will be able to allocate resources to drive this xfs work. If it can be solved within XFS then this would be preferable.
On Thu, Oct 26, 2023 at 5:21 AM Dave Chinner <david@fromorbit.com> wrote: > > On Wed, Oct 25, 2023 at 08:25:35AM -0400, Jeff Layton wrote: > > On Wed, 2023-10-25 at 19:05 +1100, Dave Chinner wrote: > > > On Tue, Oct 24, 2023 at 02:40:06PM -0400, Jeff Layton wrote: > > > > On Tue, 2023-10-24 at 10:08 +0300, Amir Goldstein wrote: > > > > > On Tue, Oct 24, 2023 at 6:40 AM Dave Chinner <david@fromorbit.com> wrote: > > > > > > > > > > > > On Mon, Oct 23, 2023 at 02:18:12PM -1000, Linus Torvalds wrote: > > > > > > > On Mon, 23 Oct 2023 at 13:26, Dave Chinner <david@fromorbit.com> wrote: > > > > > Does xfs_repair guarantee that changes of atime, or any inode changes > > > > > for that matter, update i_version? No, it does not. > > > > > So IMO, "atime does not update i_version" is not an "on-disk format change", > > > > > it is a runtime behavior change, just like lazytime is. > > > > > > > > This would certainly be my preference. I don't want to break any > > > > existing users though. > > > > > > That's why I'm trying to get some kind of consensus on what > > > rules and/or atime configurations people are happy for me to break > > > to make it look to users like there's a viable working change > > > attribute being supplied by XFS without needing to change the on > > > disk format. > > > > > > > I agree that the only bone of contention is whether to count atime > > updates against the change attribute. I think we have consensus that all > > in-kernel users do _not_ want atime updates counted against the change > > attribute. The only real question is these "legacy" users of > > di_changecount. > > Please stop refering to "legacy users" of di_changecount. Whether > there are users or not is irrelevant - it is defined by the current > on-disk format specification, and as such there may be applications > we do not know about making use of the current behaviour. 
> > It's like a linux syscall - we can't remove them because there may > be some user we don't know about still using that old syscall. We > simply don't make changes that can potentially break user > applications like that. > > The on disk format is the same - there is software out there that we don't > know about that expects a certain behaviour based on the > specification. We don't break the on disk format by making silent > behavioural changes - we require a feature flag to indicate > behaviour has changed so that applications can take appropriate > actions with stuff they don't understand. > > The example for this is the BIGTIME timestamp format change. The on > disk inode structure is physically unchanged, but the contents of > the timestamp fields are encoded very differently. Sure, the older > kernels can read the timestamp data without any sort of problem > occurring, except for the fact the timestamps now appear to be > completely corrupted. > > Changing the meaning of the contents of di_changecount is no > different. It might look OK and nothing crashes, but nothing can be > inferred from the value in the field because we don't know how it > has been modified. > I don't agree that this change is the same as BIGTIME change, but it is a good cue to ask: BIGTIME has an on-disk feature bit in super block that can be set on an existing filesystem (and not cleared?). BIGTIME also has an on-disk inode flag to specify the format in which a specific inode's timestamps are stored. If we were to change the xfs on-disk to change the *meaning* (not the format that the counter is stored) of di_changecount, would the feature flag need to be RO_COMPAT? Would this require a per-inode on-disk flag that declares the meaning of di_changecount on that specific inode? Neither of those changes is going to be very hard to do btw. Following the footsteps of the BIGTIME conversion, but without the need for actual format converters. Thanks, Amir.
On Wed, Oct 25, 2023 at 11:05 AM Dave Chinner <david@fromorbit.com> wrote: > > On Tue, Oct 24, 2023 at 02:40:06PM -0400, Jeff Layton wrote: > > On Tue, 2023-10-24 at 10:08 +0300, Amir Goldstein wrote: > > > On Tue, Oct 24, 2023 at 6:40 AM Dave Chinner <david@fromorbit.com> wrote: > > > > > > > > On Mon, Oct 23, 2023 at 02:18:12PM -1000, Linus Torvalds wrote: > > > > > On Mon, 23 Oct 2023 at 13:26, Dave Chinner <david@fromorbit.com> wrote: > > > > > > > > > > > > The problem is the first read request after a modification has been > > > > > > made. That is causing relatime to see mtime > atime and triggering > > > > > > an atime update. XFS sees this, does an atime update, and in > > > > > > committing that persistent inode metadata update, it calls > > > > > > inode_maybe_inc_iversion(force = false) to check if an iversion > > > > > > update is necessary. The VFS sees I_VERSION_QUERIED, and so it bumps > > > > > > i_version and tells XFS to persist it. > > > > > > > > > > Could we perhaps just have a mode where we don't increment i_version > > > > > for just atime updates? > > > > > > > > > > Maybe we don't even need a mode, and could just decide that atime > > > > > updates aren't i_version updates at all? > > > > > > > > We do that already - in memory atime updates don't bump i_version at > > > > all. The issue is the rare persistent atime update requests that > > > > still happen - they are the ones that trigger an i_version bump on > > > > XFS, and one of the relatime heuristics tickle this specific issue. > > > > > > > > If we push the problematic persistent atime updates to be in-memory > > > > updates only, then the whole problem with i_version goes away.... > > > > > > > > > Yes, yes, it's obviously technically a "inode modification", but does > > > > > anybody actually *want* atime updates with no actual other changes to > > > > > be version events? > > > > > > > > Well, yes, there was. 
That's why we defined i_version in the on disk > > > > format this way well over a decade ago. It was part of some deep > > > > dark magical HSM beans that allowed the application to combine > > > > multiple scans for different inode metadata changes into a single > > > > pass. atime changes was one of the things it needed to know about > > > > for tiering and space scavenging purposes.... > > > > > > > > > > But if this is such an ancient mystical program, why do we have to > > > keep this XFS behavior in the present? > > > BTW, is this the same HSM whose DMAPI ioctls were deprecated > > > a few years back? > > Drop the attitude, Amir. > > That "ancient mystical program" is this: > > https://buy.hpe.com/us/en/enterprise-solutions/high-performance-computing-solutions/high-performance-computing-storage-solutions/hpc-storage-solutions/hpe-data-management-framework-7/p/1010144088 > Sorry for the attitude Dave, I somehow got the impression that you were talking about a hypothetical old program that may be out of use. I believe that Jeff and Linus got the same impression... > Yup, that product is backed by a proprietary descendent of the Irix > XFS code base XFS that is DMAPI enabled and still in use today. It's > called HPE XFS these days.... > What do you mean? Do you mean that the HPE product uses patched XFS? If so, why is that an upstream concern? Upstream xfs indeed preserves di_dmstate,di_dmevmask, but it does not change those state members when file changes happen. So if mounting an HPE XFS disk on with upstream kernel is not going to record DMAPI state changes, does it matter if upstream xfs does not update di_changecount on atime change? Maybe I did not understand the situation w.r.t HPE XFS. > > > I mean, I understand that you do not want to change the behavior of > > > i_version update without an opt-in config or mount option - let the distro > > > make that choice. > > > But calling this an "on-disk format change" is a very long stretch. 
> > Telling the person who created, defined and implemented the on disk > format that they don't know what constitutes a change of that > on-disk format seems kinda Dunning-Kruger to me.... > OK. I will choose my words more carefully: I still do not understand, from everything that you have told us so far, including the mention of the specific product above, why not updating di_changecount on atime update constitutes an on-disk format change and not a runtime behavior change. You also did not address my comment that xfs_repair does not update di_changecount on any inode changes to the best of my code reading abilities. > There are *lots* of ways that di_changecount is now incompatible > with the VFS change counter. That's now defined as "i_version should > only change when [cm]time is changed". > > di_changecount is defined to be a count of the number of changes > made to the attributes of the inode. It's not just atime at issue > here - we bump di_changecount when make any inode change, including > background work that does not otherwise change timestamps. e.g. > allocation at writeback time, unwritten extent conversion, on-disk > EOF extension at IO completion, removal of speculative > pre-allocation beyond EOF, etc. > I see. Does xfs update ctime on all those inode block map changes? > IOWs, di_changecount was never defined as a linux "i_version" > counter, regardless of the fact we originally we able to implement > i_version with it - all extra bumps to di_changecount were not > important to the users of i_version for about a decade. > > Unfortunately, the new i_version definition is very much > incompatible with the existing di_changecount definition and that's > the underlying problem here. i.e. the problem is not that we bump > i_version on atime, it's that di_changecount is now completely > incompatible with the new i_version change semantics. 
> > To implement the new i_version semantics exactly, we need to add a > new field to the inode to hold this information. > If we change the on disk format like this, then the atime > problems go away because the new field would not get updated on > atime updates. We'd still be bumping di_changecount on atime > updates, though, because that's what is required by the on-disk > format. > I fully agree with you that we should avoid on-disk format change. This is exactly the reason that I'm insisting on the point of clarifying how exactly, this semantic change of di_changecount is going to break existing applications that run on upstream kernel. Thanks, Amir.
On Mon, 23 Oct 2023 at 17:40, Dave Chinner <david@fromorbit.com> wrote:
> >
> > Maybe we don't even need a mode, and could just decide that atime
> > updates aren't i_version updates at all?
>
> We do that already - in memory atime updates don't bump i_version at
> all. The issue is the rare persistent atime update requests that
> still happen - they are the ones that trigger an i_version bump on
> XFS, and one of the relatime heuristics tickle this specific issue.
Yes, yes, but that's what I kind of allude to - maybe people still
want the on-disk atime updates, but do they actually want the
i_version updates just because they want more aggressive atime
updates?
> > Or maybe i_version can update, but callers of getattr() could have two
> > bits for that STATX_CHANGE_COOKIE, one for "I care about atime" and
> > one for others, and we'd pass that down to inode_query_version, and
> > we'd have a I_VERSION_QUERIED and a I_VERSION_QUERIED_STRICT, and the
> "I care about atime" case would set the strict one.
>
> This makes correct behaviour reliant on the application using the
> query mechanism correctly. I have my doubts that userspace
> developers will be able to understand the subtle difference between
> the two options and always choose correctly....
I don't think we _have_ a user space interface.
At least the STATX_CHANGE_COOKIE bit isn't exposed to user space at
all. Not in the uapi headers, but not even in xstat():
/* STATX_CHANGE_COOKIE is kernel-only for now */
tmp.stx_mask = stat->result_mask & ~STATX_CHANGE_COOKIE;
So the *only* users of STATX_CHANGE_COOKIE seem to be entirely
in-kernel, unless there is something I'm missing where somebody uses
i_version through some other interface (random ioctl?).
End result: splitting STATX_CHANGE_COOKIE into a "I don't care about
atime" and a "give me all change events" shouldn't possibly break
anything that I can see.
The only other uses of inode_query_iversion() seem to be the explicit
directory optimizations (ie the "I'm caching the offset and don't want
to re-check that it's valid unless required, so give me the inode
version for the directory as a way to decide if I need to re-check"
thing).
And those *definitely* don't want i_version updates on any atime updates.
There might be some other use of inode_query_iversion() that I missed,
of course. But from a quick look, it really looks to me like we can
relax our i_version updates.
Linus
On Fri, 20 Oct 2023 at 05:12, Jeff Layton <jlayton@kernel.org> wrote:
>
> I'd _really_ like to see a proper change counter added before it's
> merged, or at least space in the on-disk inode reserved for one until we
> can get it plumbed in.
Hmm. Can we not perhaps just do an in-memory change counter, and try
to initialize it to a random value when instantiating an inode? Do we
even *require* on-disk format changes?
So on reboot, the inode would count as "changed" as far any remote
user is concerned. It would flush client caches, but isn't that what
you'd want anyway? I'd hate to waste lots of memory, but maybe people
would be ok with just a 32-bit random value. And if not...
But I actually came into this whole discussion purely through the
inode timestamp side, so I may *entirely* miss what the change counter
requirements for NFSd actually are. If it needs to be stable across
reboots, my idea is clearly complete garbage.
You can now all jump on me and point out my severe intellectual
limitations. Please use small words when you do ;)
Linus
On Fri, 2023-10-20 at 13:06 -0700, Linus Torvalds wrote: > On Fri, 20 Oct 2023 at 05:12, Jeff Layton <jlayton@kernel.org> wrote:. > > > > I'd _really_ like to see a proper change counter added before it's > > merged, or at least space in the on-disk inode reserved for one until we > > can get it plumbed in. > > Hmm. Can we not perhaps just do an in-memory change counter, and try > to initialize it to a random value when instantiating an inode? Do we > even *require* on-disk format changes? > > So on reboot, the inode would count as "changed" as far any remote > user is concerned. It would flush client caches, but isn't that what > you'd want anyway? I'd hate to waste lots of memory, but maybe people > would be ok with just a 32-bit random value. And if not... > > But I actually came into this whole discussion purely through the > inode timestamp side, so I may *entirely* miss what the change counter > requirements for NFSd actually are. If it needs to be stable across > reboots, my idea is clearly complete garbage. > > You can now all jump on me and point out my severe intellectual > limitations. Please use small words when you do ;) > Much like inode timestamps, we do depend on the change attribute persisting across reboots. Having to invalidate all of your cached data just because the server rebooted is particularly awful. That usually results in the server being hammered with reads from all of the clients at once, soon after rebooting. -- Jeff Layton <jlayton@kernel.org>
On Fri, 20 Oct 2023 at 13:06, Linus Torvalds
<torvalds@linux-foundation.org> wrote:
>
> So on reboot, the inode would count as "changed" as far any remote
> user is concerned. [..]
Obviously, not just reboot would do that. Any kind of "it's no longer
cached on the server and gets read back from disk" would do the same
thing.
Again, that may not work for the intended purpose, but if the use-case
is a "same version number means no changes", it might be acceptable?
Even if you then could get spurious version changes when the file
hasn't been accessed in a long time?
Maybe all this together with some ctime filtering ("old ctime
clearly means that the version number is irrelevant"). After all, the
whole point of fine-grained timestamps was to distinguish *frequent*
changes. An in-memory counter certainly does that even without any
on-disk representation.
Linus
© 2016 - 2025 Red Hat, Inc.