The addition of per message-id fastchannel support check exposed
a SCP firmware bug which leads to a device crash on X1E platforms.
The X1E firmware supports fastchannel on LEVEL_GET but fails to
have this set in the protocol message attributes and the fallback
to regular messaging leads to a device crash since that implementation
has a bug for all the X1E devices in the wild. Fix this by introducing
a quirk that selectively skips the per message-id fastchannel check only
for the LEVEL_GET message on X1E platforms.
Signed-off-by: Sibi Sankar <quic_sibis@quicinc.com>
---
drivers/firmware/arm_scmi/driver.c | 5 +++--
drivers/firmware/arm_scmi/perf.c | 30 +++++++++++++++++++++------
drivers/firmware/arm_scmi/powercap.c | 8 +++----
drivers/firmware/arm_scmi/protocols.h | 2 +-
4 files changed, 32 insertions(+), 13 deletions(-)
diff --git a/drivers/firmware/arm_scmi/driver.c b/drivers/firmware/arm_scmi/driver.c
index 9313b9020fc1..b182fa8e8ccb 100644
--- a/drivers/firmware/arm_scmi/driver.c
+++ b/drivers/firmware/arm_scmi/driver.c
@@ -1903,7 +1903,8 @@ static void
scmi_common_fastchannel_init(const struct scmi_protocol_handle *ph,
u8 describe_id, u32 message_id, u32 valid_size,
u32 domain, void __iomem **p_addr,
- struct scmi_fc_db_info **p_db, u32 *rate_limit)
+ struct scmi_fc_db_info **p_db, u32 *rate_limit,
+ bool skip_check)
{
int ret;
u32 flags;
@@ -1919,7 +1920,7 @@ scmi_common_fastchannel_init(const struct scmi_protocol_handle *ph,
/* Check if the MSG_ID supports fastchannel */
ret = scmi_protocol_msg_check(ph, message_id, &attributes);
- if (!ret && !MSG_SUPPORTS_FASTCHANNEL(attributes))
+ if (!ret && !MSG_SUPPORTS_FASTCHANNEL(attributes) && !skip_check)
return;
if (!p_addr) {
diff --git a/drivers/firmware/arm_scmi/perf.c b/drivers/firmware/arm_scmi/perf.c
index c7e5a34b254b..5b4559d0b054 100644
--- a/drivers/firmware/arm_scmi/perf.c
+++ b/drivers/firmware/arm_scmi/perf.c
@@ -48,6 +48,10 @@ enum {
PERF_FC_MAX,
};
+enum {
+ PERF_QUIRK_SKIP_FASTCHANNEL_LEVEL_GET,
+};
+
struct scmi_opp {
u32 perf;
u32 power;
@@ -183,6 +187,7 @@ struct scmi_perf_info {
enum scmi_power_scale power_scale;
u64 stats_addr;
u32 stats_size;
+ u32 quirks;
bool notify_lvl_cmd;
bool notify_lim_cmd;
struct perf_dom_info *dom_info;
@@ -838,9 +843,10 @@ static int scmi_perf_level_limits_notify(const struct scmi_protocol_handle *ph,
}
static void scmi_perf_domain_init_fc(const struct scmi_protocol_handle *ph,
- struct perf_dom_info *dom)
+ struct perf_dom_info *dom, u32 quirks)
{
struct scmi_fc_info *fc;
+ bool quirk_level_get = !!(quirks & BIT(PERF_QUIRK_SKIP_FASTCHANNEL_LEVEL_GET));
fc = devm_kcalloc(ph->dev, PERF_FC_MAX, sizeof(*fc), GFP_KERNEL);
if (!fc)
@@ -849,26 +855,26 @@ static void scmi_perf_domain_init_fc(const struct scmi_protocol_handle *ph,
ph->hops->fastchannel_init(ph, PERF_DESCRIBE_FASTCHANNEL,
PERF_LEVEL_GET, 4, dom->id,
&fc[PERF_FC_LEVEL].get_addr, NULL,
- &fc[PERF_FC_LEVEL].rate_limit);
+ &fc[PERF_FC_LEVEL].rate_limit, quirk_level_get);
ph->hops->fastchannel_init(ph, PERF_DESCRIBE_FASTCHANNEL,
PERF_LIMITS_GET, 8, dom->id,
&fc[PERF_FC_LIMIT].get_addr, NULL,
- &fc[PERF_FC_LIMIT].rate_limit);
+ &fc[PERF_FC_LIMIT].rate_limit, false);
if (dom->info.set_perf)
ph->hops->fastchannel_init(ph, PERF_DESCRIBE_FASTCHANNEL,
PERF_LEVEL_SET, 4, dom->id,
&fc[PERF_FC_LEVEL].set_addr,
&fc[PERF_FC_LEVEL].set_db,
- &fc[PERF_FC_LEVEL].rate_limit);
+ &fc[PERF_FC_LEVEL].rate_limit, false);
if (dom->set_limits)
ph->hops->fastchannel_init(ph, PERF_DESCRIBE_FASTCHANNEL,
PERF_LIMITS_SET, 8, dom->id,
&fc[PERF_FC_LIMIT].set_addr,
&fc[PERF_FC_LIMIT].set_db,
- &fc[PERF_FC_LIMIT].rate_limit);
+ &fc[PERF_FC_LIMIT].rate_limit, false);
dom->fc_info = fc;
}
@@ -1282,6 +1288,7 @@ static int scmi_perf_protocol_init(const struct scmi_protocol_handle *ph)
{
int domain, ret;
u32 version;
+ struct device_node *np;
struct scmi_perf_info *pinfo;
ret = ph->xops->version_get(ph, &version);
@@ -1297,6 +1304,17 @@ static int scmi_perf_protocol_init(const struct scmi_protocol_handle *ph)
pinfo->version = version;
+ /*
+ * Some X1E devices support fastchannel for LEVEL_GET but erroneously
+ * says otherwise in the protocol message attributes. Add a quirk to
+ * force fastchannel on LEVEL_GET to prevent crashes on such devices.
+ */
+ np = of_find_compatible_node(NULL, NULL, "qcom,x1e80100");
+ if (np) {
+ pinfo->quirks = BIT(PERF_QUIRK_SKIP_FASTCHANNEL_LEVEL_GET);
+ of_node_put(np);
+ }
+
ret = scmi_perf_attributes_get(ph, pinfo);
if (ret)
return ret;
@@ -1315,7 +1333,7 @@ static int scmi_perf_protocol_init(const struct scmi_protocol_handle *ph)
scmi_perf_describe_levels_get(ph, dom, version);
if (dom->perf_fastchannels)
- scmi_perf_domain_init_fc(ph, dom);
+ scmi_perf_domain_init_fc(ph, dom, pinfo->quirks);
}
ret = devm_add_action_or_reset(ph->dev, scmi_perf_xa_destroy, pinfo);
diff --git a/drivers/firmware/arm_scmi/powercap.c b/drivers/firmware/arm_scmi/powercap.c
index 1fa79bba492e..90e92fc432e3 100644
--- a/drivers/firmware/arm_scmi/powercap.c
+++ b/drivers/firmware/arm_scmi/powercap.c
@@ -720,23 +720,23 @@ static void scmi_powercap_domain_init_fc(const struct scmi_protocol_handle *ph,
POWERCAP_CAP_SET, 4, domain,
&fc[POWERCAP_FC_CAP].set_addr,
&fc[POWERCAP_FC_CAP].set_db,
- &fc[POWERCAP_FC_CAP].rate_limit);
+ &fc[POWERCAP_FC_CAP].rate_limit, false);
ph->hops->fastchannel_init(ph, POWERCAP_DESCRIBE_FASTCHANNEL,
POWERCAP_CAP_GET, 4, domain,
&fc[POWERCAP_FC_CAP].get_addr, NULL,
- &fc[POWERCAP_FC_CAP].rate_limit);
+ &fc[POWERCAP_FC_CAP].rate_limit, false);
ph->hops->fastchannel_init(ph, POWERCAP_DESCRIBE_FASTCHANNEL,
POWERCAP_PAI_SET, 4, domain,
&fc[POWERCAP_FC_PAI].set_addr,
&fc[POWERCAP_FC_PAI].set_db,
- &fc[POWERCAP_FC_PAI].rate_limit);
+ &fc[POWERCAP_FC_PAI].rate_limit, false);
ph->hops->fastchannel_init(ph, POWERCAP_DESCRIBE_FASTCHANNEL,
POWERCAP_PAI_GET, 4, domain,
&fc[POWERCAP_FC_PAI].get_addr, NULL,
- &fc[POWERCAP_FC_PAI].rate_limit);
+ &fc[POWERCAP_FC_PAI].rate_limit, false);
*p_fc = fc;
}
diff --git a/drivers/firmware/arm_scmi/protocols.h b/drivers/firmware/arm_scmi/protocols.h
index d62c4469d1fd..fe4c724caba3 100644
--- a/drivers/firmware/arm_scmi/protocols.h
+++ b/drivers/firmware/arm_scmi/protocols.h
@@ -280,7 +280,7 @@ struct scmi_proto_helpers_ops {
u32 valid_size, u32 domain,
void __iomem **p_addr,
struct scmi_fc_db_info **p_db,
- u32 *rate_limit);
+ u32 *rate_limit, bool skip_check);
void (*fastchannel_db_ring)(struct scmi_fc_db_info *db);
int (*get_max_msg_size)(const struct scmi_protocol_handle *ph);
};
--
2.34.1
On Wed, Feb 26, 2025 at 08:13:38AM +0530, Sibi Sankar wrote:
> The addition of per message-id fastchannel support check exposed
> a SCP firmware bug which leads to a device crash on X1E platforms.
> The X1E firmware supports fastchannel on LEVEL_GET but fails to
> have this set in the protocol message attributes and the fallback
> to regular messaging leads to a device crash since that implementation
> has a bug for all the X1E devices in the wild. Fix this by introducing
> a quirk that selectively skips the per message-id fastchannel check only
> for the LEVEL_GET message on X1E platforms.
>
> Signed-off-by: Sibi Sankar <quic_sibis@quicinc.com>
> ---
> drivers/firmware/arm_scmi/driver.c | 5 +++--
> drivers/firmware/arm_scmi/perf.c | 30 +++++++++++++++++++++------
> drivers/firmware/arm_scmi/powercap.c | 8 +++----
> drivers/firmware/arm_scmi/protocols.h | 2 +-
> 4 files changed, 32 insertions(+), 13 deletions(-)
>
> diff --git a/drivers/firmware/arm_scmi/driver.c b/drivers/firmware/arm_scmi/driver.c
> index 9313b9020fc1..b182fa8e8ccb 100644
> --- a/drivers/firmware/arm_scmi/driver.c
> +++ b/drivers/firmware/arm_scmi/driver.c
> @@ -1903,7 +1903,8 @@ static void
> scmi_common_fastchannel_init(const struct scmi_protocol_handle *ph,
> u8 describe_id, u32 message_id, u32 valid_size,
> u32 domain, void __iomem **p_addr,
> - struct scmi_fc_db_info **p_db, u32 *rate_limit)
> + struct scmi_fc_db_info **p_db, u32 *rate_limit,
> + bool skip_check)
This does not look like it will scale.
> {
> int ret;
> u32 flags;
> @@ -1919,7 +1920,7 @@ scmi_common_fastchannel_init(const struct scmi_protocol_handle *ph,
>
> /* Check if the MSG_ID supports fastchannel */
> ret = scmi_protocol_msg_check(ph, message_id, &attributes);
> - if (!ret && !MSG_SUPPORTS_FASTCHANNEL(attributes))
> + if (!ret && !MSG_SUPPORTS_FASTCHANNEL(attributes) && !skip_check)
Why can't you just make sure that the bit is set in attributes as I
suggested? That seems like it should allow for a minimal implementation
of this.
> return;
>
> if (!p_addr) {
> @@ -1282,6 +1288,7 @@ static int scmi_perf_protocol_init(const struct scmi_protocol_handle *ph)
> {
> int domain, ret;
> u32 version;
> + struct device_node *np;
> struct scmi_perf_info *pinfo;
>
> ret = ph->xops->version_get(ph, &version);
> @@ -1297,6 +1304,17 @@ static int scmi_perf_protocol_init(const struct scmi_protocol_handle *ph)
>
> pinfo->version = version;
>
> + /*
> + * Some X1E devices support fastchannel for LEVEL_GET but erroneously
> + * says otherwise in the protocol message attributes. Add a quirk to
> + * force fastchannel on LEVEL_GET to prevent crashes on such devices.
> + */
> + np = of_find_compatible_node(NULL, NULL, "qcom,x1e80100");
Here you should use of_machine_is_compatible().
> + if (np) {
> + pinfo->quirks = BIT(PERF_QUIRK_SKIP_FASTCHANNEL_LEVEL_GET);
> + of_node_put(np);
> + }
> +
> ret = scmi_perf_attributes_get(ph, pinfo);
> if (ret)
> return ret;
Johan
On Wed, Feb 26, 2025 at 09:12:23AM +0100, Johan Hovold wrote:
> On Wed, Feb 26, 2025 at 08:13:38AM +0530, Sibi Sankar wrote:
> > scmi_common_fastchannel_init(const struct scmi_protocol_handle *ph,
> > u8 describe_id, u32 message_id, u32 valid_size,
> > u32 domain, void __iomem **p_addr,
> > - struct scmi_fc_db_info **p_db, u32 *rate_limit)
> > + struct scmi_fc_db_info **p_db, u32 *rate_limit,
> > + bool skip_check)
>
> This does not look like it will scale.
After taking a closer look, perhaps it needs to be done along these
lines.
But calling the parameter 'force' or similar as Dan suggested should
make it more readable.
>
> > {
> > int ret;
> > u32 flags;
> > @@ -1919,7 +1920,7 @@ scmi_common_fastchannel_init(const struct scmi_protocol_handle *ph,
> >
> > /* Check if the MSG_ID supports fastchannel */
> > ret = scmi_protocol_msg_check(ph, message_id, &attributes);
> > - if (!ret && !MSG_SUPPORTS_FASTCHANNEL(attributes))
> > + if (!ret && !MSG_SUPPORTS_FASTCHANNEL(attributes) && !skip_check)
>
> Why can't you just make sure that the bit is set in attributes as I
> suggested? That seems like it should allow for a minimal implementation
> of this.
My idea here was that you could come up with some way of abstracting
this so that you did not have to update every call site. Not sure how
feasible that is.
> > return;
> >
> > if (!p_addr) {
Johan
On Wed, Feb 26, 2025 at 09:55:21AM +0100, Johan Hovold wrote:
> On Wed, Feb 26, 2025 at 09:12:23AM +0100, Johan Hovold wrote:
> > On Wed, Feb 26, 2025 at 08:13:38AM +0530, Sibi Sankar wrote:
>
> > > scmi_common_fastchannel_init(const struct scmi_protocol_handle *ph,
> > > u8 describe_id, u32 message_id, u32 valid_size,
> > > u32 domain, void __iomem **p_addr,
> > > - struct scmi_fc_db_info **p_db, u32 *rate_limit)
> > > + struct scmi_fc_db_info **p_db, u32 *rate_limit,
> > > + bool skip_check)
> >
> > This does not look like it will scale.
>
> After taking a closer look, perhaps it needs to be done along these
> lines.
>
> But calling the parameter 'force' or similar as Dan suggested should
> make it more readable.
>
> >
> > > {
> > > int ret;
> > > u32 flags;
> > > @@ -1919,7 +1920,7 @@ scmi_common_fastchannel_init(const struct scmi_protocol_handle *ph,
> > >
> > > /* Check if the MSG_ID supports fastchannel */
> > > ret = scmi_protocol_msg_check(ph, message_id, &attributes);
> > > - if (!ret && !MSG_SUPPORTS_FASTCHANNEL(attributes))
> > > + if (!ret && !MSG_SUPPORTS_FASTCHANNEL(attributes) && !skip_check)
> >
> > Why can't you just make sure that the bit is set in attributes as I
> > suggested? That seems like it should allow for a minimal implementation
> > of this.
>
> My idea here was that you could come up with some way of abstracting
> this so that you did not have to update every call site. Not sure how
> feasible that is.
>
I'm having a hard time finding your email.
Why does the scmi_proto_helpers_ops struct even exist? We could just
call all these functions directly. Do we have plans to actually create
different implementations?
If we got rid of the scmi_proto_helpers_ops struct then we could just
rename scmi_common_fastchannel_init() to __scmi_common_fastchannel_init()
and create a default wrapper around it and a _forced() wrapper.
Some other potentially stupid ideas in the spirit of brainstorming are
that we could add a quirks parameter which takes a flag instead of a
bool. Or we could add a quirks flag to the scmi_protocol_handle struct.
regards,
dan carpenter
On Wed, Feb 26, 2025 at 12:31:27PM +0300, Dan Carpenter wrote:
> On Wed, Feb 26, 2025 at 09:55:21AM +0100, Johan Hovold wrote:
> > On Wed, Feb 26, 2025 at 09:12:23AM +0100, Johan Hovold wrote:
> > > On Wed, Feb 26, 2025 at 08:13:38AM +0530, Sibi Sankar wrote:
> >
> > > > scmi_common_fastchannel_init(const struct scmi_protocol_handle *ph,
> > > > u8 describe_id, u32 message_id, u32 valid_size,
> > > > u32 domain, void __iomem **p_addr,
> > > > - struct scmi_fc_db_info **p_db, u32 *rate_limit)
> > > > + struct scmi_fc_db_info **p_db, u32 *rate_limit,
> > > > + bool skip_check)
> > >
> > > This does not look like it will scale.
> >
> > After taking a closer look, perhaps it needs to be done along these
> > lines.
> >
> > But calling the parameter 'force' or similar as Dan suggested should
> > make it more readable.
> >
> > >
> > > > {
> > > > int ret;
> > > > u32 flags;
> > > > @@ -1919,7 +1920,7 @@ scmi_common_fastchannel_init(const struct scmi_protocol_handle *ph,
> > > >
> > > > /* Check if the MSG_ID supports fastchannel */
> > > > ret = scmi_protocol_msg_check(ph, message_id, &attributes);
> > > > - if (!ret && !MSG_SUPPORTS_FASTCHANNEL(attributes))
> > > > + if (!ret && !MSG_SUPPORTS_FASTCHANNEL(attributes) && !skip_check)
> > >
> > > Why can't you just make sure that the bit is set in attributes as I
> > > suggested? That seems like it should allow for a minimal implementation
> > > of this.
> >
> > My idea here was that you could come up with some way of abstracting
> > this so that you did not have to update every call site. Not sure how
> > feasible that is.
> >
>
> I'm having a hard time finding your email.
>
> Why does the scmi_proto_helpers_ops struct even exist? We could just
> call all these functions directly. Do we have plans to actually create
> different implementations?
>
> If we got rid of the scmi_proto_helpers_ops struct then we could just
> rename scmi_common_fastchannel_init() to __scmi_common_fastchannel_init()
> and create a default wrapper around it and a _forced() wrapper.
These are common helpers for all the protocols to avoid endlessly
repeating the same code pattern (like fc_init or iterators) and they are
laid out this way to be usable by any protocol even if you compile it as
a module WITHOUT the need of EXPORTing any of these symbols....same
reason why all protocol ops exposed in scmi_protocol.h are defined to be
accessed using an instance handle....trying to limit as much as possible
EXPORTing symbols beside the basic needs...specially because SCMI
specifes a protocol that is extensible by design both for standard and
vendor protocols, so it would soon became a hell to maintain such an
API..I mean just look at scmi_protocol.h...these helpers are made
accessible this way for the same reasons...same for the xops
(beside also not being overlying appreciated in the Android world to
ass an excessive number of EXPORTed syms)
Thanks,
Cristian
On Wed, Feb 26, 2025 at 12:31:27PM +0300, Dan Carpenter wrote:
> On Wed, Feb 26, 2025 at 09:55:21AM +0100, Johan Hovold wrote:
> > On Wed, Feb 26, 2025 at 09:12:23AM +0100, Johan Hovold wrote:
> > > On Wed, Feb 26, 2025 at 08:13:38AM +0530, Sibi Sankar wrote:
> >
> > > > scmi_common_fastchannel_init(const struct scmi_protocol_handle *ph,
> > > > u8 describe_id, u32 message_id, u32 valid_size,
> > > > u32 domain, void __iomem **p_addr,
> > > > - struct scmi_fc_db_info **p_db, u32 *rate_limit)
> > > > + struct scmi_fc_db_info **p_db, u32 *rate_limit,
> > > > + bool skip_check)
> > >
> > > This does not look like it will scale.
> >
> > After taking a closer look, perhaps it needs to be done along these
> > lines.
> >
> > But calling the parameter 'force' or similar as Dan suggested should
> > make it more readable.
> >
> > > > {
> > > > int ret;
> > > > u32 flags;
> > > > @@ -1919,7 +1920,7 @@ scmi_common_fastchannel_init(const struct scmi_protocol_handle *ph,
> > > >
> > > > /* Check if the MSG_ID supports fastchannel */
> > > > ret = scmi_protocol_msg_check(ph, message_id, &attributes);
> > > > - if (!ret && !MSG_SUPPORTS_FASTCHANNEL(attributes))
> > > > + if (!ret && !MSG_SUPPORTS_FASTCHANNEL(attributes) && !skip_check)
> > >
> > > Why can't you just make sure that the bit is set in attributes as I
> > > suggested? That seems like it should allow for a minimal implementation
> > > of this.
> >
> > My idea here was that you could come up with some way of abstracting
> > this so that you did not have to update every call site. Not sure how
> > feasible that is.
>
> I'm having a hard time finding your email.
https://lore.kernel.org/lkml/Z4Dt8E7C6upVtEGV@hovoldconsulting.com/
> Why does the scmi_proto_helpers_ops struct even exist? We could just
> call all these functions directly. Do we have plans to actually create
> different implementations?
>
> If we got rid of the scmi_proto_helpers_ops struct then we could just
> rename scmi_common_fastchannel_init() to __scmi_common_fastchannel_init()
> and create a default wrapper around it and a _forced() wrapper.
>
> Some other potentially stupid ideas in the spirit of brainstorming are
> that we could add a quirks parameter which takes a flag instead of a
> bool. Or we could add a quirks flag to the scmi_protocol_handle struct.
Something like that, yes. :) I didn't try to implement it, but it seems
like it should be possible implement this is a way that keeps the quirk
handling isolated.
Johan
On Wed, Feb 26, 2025 at 10:58:44AM +0100, Johan Hovold wrote:
> On Wed, Feb 26, 2025 at 12:31:27PM +0300, Dan Carpenter wrote:
> > On Wed, Feb 26, 2025 at 09:55:21AM +0100, Johan Hovold wrote:
> > > On Wed, Feb 26, 2025 at 09:12:23AM +0100, Johan Hovold wrote:
> > > > On Wed, Feb 26, 2025 at 08:13:38AM +0530, Sibi Sankar wrote:
> > >
> > > > > scmi_common_fastchannel_init(const struct scmi_protocol_handle *ph,
> > > > > u8 describe_id, u32 message_id, u32 valid_size,
> > > > > u32 domain, void __iomem **p_addr,
> > > > > - struct scmi_fc_db_info **p_db, u32 *rate_limit)
> > > > > + struct scmi_fc_db_info **p_db, u32 *rate_limit,
> > > > > + bool skip_check)
> > > >
> > > > This does not look like it will scale.
> > >
Hi,
> > > After taking a closer look, perhaps it needs to be done along these
> > > lines.
> > >
> > > But calling the parameter 'force' or similar as Dan suggested should
> > > make it more readable.
> > >
> > > > > {
> > > > > int ret;
> > > > > u32 flags;
> > > > > @@ -1919,7 +1920,7 @@ scmi_common_fastchannel_init(const struct scmi_protocol_handle *ph,
> > > > >
> > > > > /* Check if the MSG_ID supports fastchannel */
> > > > > ret = scmi_protocol_msg_check(ph, message_id, &attributes);
> > > > > - if (!ret && !MSG_SUPPORTS_FASTCHANNEL(attributes))
> > > > > + if (!ret && !MSG_SUPPORTS_FASTCHANNEL(attributes) && !skip_check)
> > > >
> > > > Why can't you just make sure that the bit is set in attributes as I
> > > > suggested? That seems like it should allow for a minimal implementation
> > > > of this.
> > >
> > > My idea here was that you could come up with some way of abstracting
> > > this so that you did not have to update every call site. Not sure how
> > > feasible that is.
> >
> > I'm having a hard time finding your email.
>
> https://lore.kernel.org/lkml/Z4Dt8E7C6upVtEGV@hovoldconsulting.com/
>
> > Why does the scmi_proto_helpers_ops struct even exist? We could just
> > call all these functions directly. Do we have plans to actually create
> > different implementations?
> >
> > If we got rid of the scmi_proto_helpers_ops struct then we could just
> > rename scmi_common_fastchannel_init() to __scmi_common_fastchannel_init()
> > and create a default wrapper around it and a _forced() wrapper.
> >
> > Some other potentially stupid ideas in the spirit of brainstorming are
> > that we could add a quirks parameter which takes a flag instead of a
> > bool. Or we could add a quirks flag to the scmi_protocol_handle struct.
>
> Something like that, yes. :) I didn't try to implement it, but it seems
> like it should be possible implement this is a way that keeps the quirk
> handling isolated.
>
I hope next week to have a better look at this, in tne meantime just a
few considerations....
Sooner or later we should have introduced some sort of quirk framework
in SCMI to deal systematically with potentially out-of-spec FW, but as
in the name, it should be some sort of framework where you have a table of
quirks, related activation conditions and a few very well isolated points
where the quirks are placed and take action if enabled...this does not
seem the case here where instead an ad-hoc param is added to the function
that needs to be quirked...this does not scale and will make the codebase
a mess IMHO...
Last but not least, these quirks 'activations' in the SCMI world should
be driven mainly by the VENDOR/SUB-VENDOR/IMPLEMENTATION_VERS triple and
only as a last resort by a platform compatible match...because the
IMPLEMENTATION_VERSION, exposed by the FW and gathered via SCMI Base
protocol, is completely under the vendor control so it can, and should, be
used to identify broken FW-versions...indeed all the distinct SCMI protocols
are anyway versioned elsewhere according to the spec, so there is no need to
repeat SCMI protocol version here..I mean it is an IMPLEMENTATION version
As an example on a JUNO the SCP reference FW reports:
arm-scmi arm-scmi.1.auto: SCMI Protocol v2.0 'arm:arm' Firmware version 0x20f0000
where the FW version represent something like the FW release tag, I think...not
really sure about the menaing our FW gys give to it, BUT definitely it is not
a bare copy of the SCMI protocol version...
So...
...does the platfom-to-be-quirked at hand properly use the IMPLEMENTATION_VERSION
flag in a sensible way ?
IOW does that change between a bad and good (or less bad :D) version ?
Because shooting with the platform 'compatible-gun' should be the last resort
if all of the above does NOT happen in this beloved fw...
Anyway, after all of this babbling, I know, talk is cheap :D...so now I will shut
up and see if I can prototype something generic to deal with quirks, possibly
next week...
Thanks,
Cristian
Hi Cristian, On Thu, Feb 27, 2025 at 08:34:44AM +0000, Cristian Marussi wrote: > On Wed, Feb 26, 2025 at 10:58:44AM +0100, Johan Hovold wrote: > > Something like that, yes. :) I didn't try to implement it, but it seems > > like it should be possible implement this is a way that keeps the quirk > > handling isolated. > > I hope next week to have a better look at this, in tne meantime just a > few considerations.... > > Sooner or later we should have introduced some sort of quirk framework > in SCMI to deal systematically with potentially out-of-spec FW, but as > in the name, it should be some sort of framework where you have a table of > quirks, related activation conditions and a few very well isolated points > where the quirks are placed and take action if enabled...this does not > seem the case here where instead an ad-hoc param is added to the function > that needs to be quirked...this does not scale and will make the codebase > a mess IMHO... Sounds good. At least we have a good understanding now of how this particular firmware is broken so it would be great if you could use this as a test case for the implementation. In summary, we need to force the use of a fast channel for PERF_LEVEL_GET on these machines, or possibly fall back to the current behaviour of only using the domain attribute to determine whether the fast channels should be initialised. The latter may allow for a less intrusive implementation even if we'd still see: arm-scmi arm-scmi.0.auto: Failed to get FC for protocol 13 [MSG_ID:6 / RES_ID:0] - ret:-95. Using regular messaging. arm-scmi arm-scmi.0.auto: Failed to get FC for protocol 13 [MSG_ID:6 / RES_ID:1] - ret:-95. Using regular messaging. arm-scmi arm-scmi.0.auto: Failed to get FC for protocol 13 [MSG_ID:6 / RES_ID:2] - ret:-95. Using regular messaging. when not supported for all messages (e.g. with the current firmware). > Last but not least, these quirks 'activations' in the SCMI world should > be driven mainly by the VENDOR/SUB-VENDOR/IMPLEMENTATION_VERS triple and > only as a last resort by a platform compatible match...because the > IMPLEMENTATION_VERSION, exposed by the FW and gathered via SCMI Base > protocol, is completely under the vendor control so it can, and should, be > used to identify broken FW-versions...indeed all the distinct SCMI protocols > are anyway versioned elsewhere according to the spec, so there is no need to > repeat SCMI protocol version here..I mean it is an IMPLEMENTATION version > > As an example on a JUNO the SCP reference FW reports: > > arm-scmi arm-scmi.1.auto: SCMI Protocol v2.0 'arm:arm' Firmware version 0x20f0000 > > where the FW version represent something like the FW release tag, I think...not > really sure about the menaing our FW gys give to it, BUT definitely it is not > a bare copy of the SCMI protocol version... > > So... > ...does the platfom-to-be-quirked at hand properly use the IMPLEMENTATION_VERSION > flag in a sensible way ? > IOW does that change between a bad and good (or less bad :D) version ? I guess only Sibi and Qualcomm can answer that. Both machines I have that suffer from this report: arm-scmi arm-scmi.0.auto: SCMI Protocol v2.0 'Qualcomm:' Firmware version 0x20000 and I'm not sure if any fixed firmware has made it out to the vendors yet or if the version was bumped when this was fixed. On the other hand, perhaps forcing fast channel initialisation for PERF_LEVEL_GET on all 'Qualcomm' firmware would work (i.e. only based on VENDOR). > Because shooting with the platform 'compatible-gun' should be the last resort > if all of the above does NOT happen in this beloved fw... > > Anyway, after all of this babbling, I know, talk is cheap :D...so now I will shut > up and see if I can prototype something generic to deal with quirks, possibly > next week... Much appreciated, thanks. Johan
Hi Cristian, On Mon, Mar 03, 2025 at 11:53:52AM +0100, Johan Hovold wrote: > On Thu, Feb 27, 2025 at 08:34:44AM +0000, Cristian Marussi wrote: > > On Wed, Feb 26, 2025 at 10:58:44AM +0100, Johan Hovold wrote: > > > > Something like that, yes. :) I didn't try to implement it, but it seems > > > like it should be possible implement this is a way that keeps the quirk > > > handling isolated. > > > > I hope next week to have a better look at this, in tne meantime just a > > few considerations.... > > > > Sooner or later we should have introduced some sort of quirk framework > > in SCMI to deal systematically with potentially out-of-spec FW, but as > > in the name, it should be some sort of framework where you have a table of > > quirks, related activation conditions and a few very well isolated points > > where the quirks are placed and take action if enabled...this does not > > seem the case here where instead an ad-hoc param is added to the function > > that needs to be quirked...this does not scale and will make the codebase > > a mess IMHO... > > Sounds good. At least we have a good understanding now of how this > particular firmware is broken so it would be great if you could use > this as a test case for the implementation. > > In summary, we need to force the use of a fast channel for > PERF_LEVEL_GET on these machines, or possibly fall back to the current > behaviour of only using the domain attribute to determine whether the > fast channels should be initialised. > > The latter may allow for a less intrusive implementation even if we'd > still see: > > arm-scmi arm-scmi.0.auto: Failed to get FC for protocol 13 [MSG_ID:6 / RES_ID:0] - ret:-95. Using regular messaging. > arm-scmi arm-scmi.0.auto: Failed to get FC for protocol 13 [MSG_ID:6 / RES_ID:1] - ret:-95. Using regular messaging. > arm-scmi arm-scmi.0.auto: Failed to get FC for protocol 13 [MSG_ID:6 / RES_ID:2] - ret:-95. Using regular messaging. > > when not supported for all messages (e.g. with the current firmware). > > Anyway, after all of this babbling, I know, talk is cheap :D...so now I will shut > > up and see if I can prototype something generic to deal with quirks, possibly > > next week... Have you made any progress on the quirk framework prototyping? Do you need any input from Sibi on the protocol versioning for that? We'd really like to enable cpufreq on this platform and ideally in 6.15. I think that should be possible given that we now understand in what ways the firmware is broken and what is needed to handle it even if we still need to decide on how best to implement this. Johan
On Tue, Mar 18, 2025 at 09:16:36AM +0100, Johan Hovold wrote: > Hi Cristian, Hi Johan, > > On Mon, Mar 03, 2025 at 11:53:52AM +0100, Johan Hovold wrote: > > On Thu, Feb 27, 2025 at 08:34:44AM +0000, Cristian Marussi wrote: > > > On Wed, Feb 26, 2025 at 10:58:44AM +0100, Johan Hovold wrote: > > > > > > Something like that, yes. :) I didn't try to implement it, but it seems > > > > like it should be possible implement this is a way that keeps the quirk > > > > handling isolated. > > > > > > I hope next week to have a better look at this, in tne meantime just a > > > few considerations.... > > > > > > Sooner or later we should have introduced some sort of quirk framework > > > in SCMI to deal systematically with potentially out-of-spec FW, but as > > > in the name, it should be some sort of framework where you have a table of > > > quirks, related activation conditions and a few very well isolated points > > > where the quirks are placed and take action if enabled...this does not > > > seem the case here where instead an ad-hoc param is added to the function > > > that needs to be quirked...this does not scale and will make the codebase > > > a mess IMHO... > > > > Sounds good. At least we have a good understanding now of how this > > particular firmware is broken so it would be great if you could use > > this as a test case for the implementation. > > > > In summary, we need to force the use of a fast channel for > > PERF_LEVEL_GET on these machines, or possibly fall back to the current > > behaviour of only using the domain attribute to determine whether the > > fast channels should be initialised. > > > > The latter may allow for a less intrusive implementation even if we'd > > still see: > > > > arm-scmi arm-scmi.0.auto: Failed to get FC for protocol 13 [MSG_ID:6 / RES_ID:0] - ret:-95. Using regular messaging. > > arm-scmi arm-scmi.0.auto: Failed to get FC for protocol 13 [MSG_ID:6 / RES_ID:1] - ret:-95. Using regular messaging. > > arm-scmi arm-scmi.0.auto: Failed to get FC for protocol 13 [MSG_ID:6 / RES_ID:2] - ret:-95. Using regular messaging. > > > > when not supported for all messages (e.g. with the current firmware). > > > > Anyway, after all of this babbling, I know, talk is cheap :D...so now I will shut > > > up and see if I can prototype something generic to deal with quirks, possibly > > > next week... > > Have you made any progress on the quirk framework prototyping? > I have not forgot, tried a few things, but nothing really to post as of now...dont wnat to rush either .... I was hoping to push something out at the end of this next merge window... > Do you need any input from Sibi on the protocol versioning for that? > No I am fine, I am planning anyway for something generic enough to be easy then to plug your own quirks separately... > We'd really like to enable cpufreq on this platform and ideally in 6.15. > I think that should be possible given that we now understand in what > ways the firmware is broken and what is needed to handle it even if we > still need to decide on how best to implement this. > v6.15 seems hard/impossible even using the original Sibi patch given the usual upstreaming-timeline of the SCMI stack where everything has to be usually reviewed and accepted by rc4/rc5.....so both Sibi initial patch and my own babbling were alreaady sort of late. Thanks, Cristian
On Tue, Mar 18, 2025 at 01:29:19PM +0000, Cristian Marussi wrote: > On Tue, Mar 18, 2025 at 09:16:36AM +0100, Johan Hovold wrote: > > Have you made any progress on the quirk framework prototyping? > > I have not forgot, tried a few things, but nothing really to post as of > now...dont wnat to rush either .... I was hoping to push something out at > the end of this next merge window... > > > Do you need any input from Sibi on the protocol versioning for that? > > No I am fine, I am planning anyway for something generic enough to be > easy then to plug your own quirks separately... Sounds good, thanks. > > We'd really like to enable cpufreq on this platform and ideally in 6.15. > > I think that should be possible given that we now understand in what > > ways the firmware is broken and what is needed to handle it even if we > > still need to decide on how best to implement this. > > v6.15 seems hard/impossible even using the original Sibi patch > given the usual upstreaming-timeline of the SCMI stack where everything has > to be usually reviewed and accepted by rc4/rc5.....so both Sibi initial > patch and my own babbling were alreaady sort of late. Yes, sorry, I wasn't referring to Sibi's SCMI patches, but the devicetree changes needed to enable cpufreq on these platforms (that will go through the qcom tree). It may be a little late for that too, but with an understanding of the problem and a quirk implementation around the corner, we could enable cpufreq as long as we make sure that Sibi's per-channel FC fix (that addresses the FC warnings) is not merged without the quirk in place (to avoid the crashes). Johan
On Wed, Feb 26, 2025 at 08:13:38AM +0530, Sibi Sankar wrote:
> The addition of per message-id fastchannel support check exposed
> a SCP firmware bug which leads to a device crash on X1E platforms.
You're fixing a bug that is introduced in patch 1. That's not allowed.
These need to be squashed into one patch. I means the patch will be
slightly long and the commit message will be slightly long but I
feel like it's manageable.
> The X1E firmware supports fastchannel on LEVEL_GET but fails to
> have this set in the protocol message attributes and the fallback
> to regular messaging leads to a device crash since that implementation
> has a bug for all the X1E devices in the wild. Fix this by introducing
> a quirk that selectively skips the per message-id fastchannel check only
> for the LEVEL_GET message on X1E platforms.
>
> Signed-off-by: Sibi Sankar <quic_sibis@quicinc.com>
> ---
> drivers/firmware/arm_scmi/driver.c | 5 +++--
> drivers/firmware/arm_scmi/perf.c | 30 +++++++++++++++++++++------
> drivers/firmware/arm_scmi/powercap.c | 8 +++----
> drivers/firmware/arm_scmi/protocols.h | 2 +-
> 4 files changed, 32 insertions(+), 13 deletions(-)
>
> diff --git a/drivers/firmware/arm_scmi/driver.c b/drivers/firmware/arm_scmi/driver.c
> index 9313b9020fc1..b182fa8e8ccb 100644
> --- a/drivers/firmware/arm_scmi/driver.c
> +++ b/drivers/firmware/arm_scmi/driver.c
> @@ -1903,7 +1903,8 @@ static void
> scmi_common_fastchannel_init(const struct scmi_protocol_handle *ph,
> u8 describe_id, u32 message_id, u32 valid_size,
> u32 domain, void __iomem **p_addr,
> - struct scmi_fc_db_info **p_db, u32 *rate_limit)
> + struct scmi_fc_db_info **p_db, u32 *rate_limit,
> + bool skip_check)
> {
> int ret;
> u32 flags;
> @@ -1919,7 +1920,7 @@ scmi_common_fastchannel_init(const struct scmi_protocol_handle *ph,
>
> /* Check if the MSG_ID supports fastchannel */
> ret = scmi_protocol_msg_check(ph, message_id, &attributes);
> - if (!ret && !MSG_SUPPORTS_FASTCHANNEL(attributes))
> + if (!ret && !MSG_SUPPORTS_FASTCHANNEL(attributes) && !skip_check)
> return;
This is explained well in the commit message but the comment here needs
to be better. Also if scmi_protocol_msg_check() fails then we should
return. Let's rename the variable to "force_fastchannel".
ret = scmi_protocol_msg_check(ph, message_id, &attributes);
if (ret)
return;
/*
* Check if the MSG_ID supports fastchannel. The force_fastchannel
* quirk is necessary to work around a firmware bug. We can probably
* remove that quirk in 2030.
*/
if (!force_fastchannel && !MSG_SUPPORTS_FASTCHANNEL(attributes))
return;
>
> if (!p_addr) {
> diff --git a/drivers/firmware/arm_scmi/perf.c b/drivers/firmware/arm_scmi/perf.c
> index c7e5a34b254b..5b4559d0b054 100644
> --- a/drivers/firmware/arm_scmi/perf.c
> +++ b/drivers/firmware/arm_scmi/perf.c
> @@ -48,6 +48,10 @@ enum {
> PERF_FC_MAX,
> };
>
> +enum {
> + PERF_QUIRK_SKIP_FASTCHANNEL_LEVEL_GET,
> +};
Let's keep the FORCE_FASTCHANNEL naming. Normally we would do this like:
#define PERF_QUIRK_FORCE_FASTCHANNEL BIT(0)
> +
> struct scmi_opp {
> u32 perf;
> u32 power;
> @@ -183,6 +187,7 @@ struct scmi_perf_info {
> enum scmi_power_scale power_scale;
> u64 stats_addr;
> u32 stats_size;
> + u32 quirks;
> bool notify_lvl_cmd;
> bool notify_lim_cmd;
> struct perf_dom_info *dom_info;
> @@ -838,9 +843,10 @@ static int scmi_perf_level_limits_notify(const struct scmi_protocol_handle *ph,
> }
>
> static void scmi_perf_domain_init_fc(const struct scmi_protocol_handle *ph,
> - struct perf_dom_info *dom)
> + struct perf_dom_info *dom, u32 quirks)
> {
> struct scmi_fc_info *fc;
> + bool quirk_level_get = !!(quirks & BIT(PERF_QUIRK_SKIP_FASTCHANNEL_LEVEL_GET));
bool force_fastchannel = !!(quirks & PERF_QUIRK_FORCE_FASTCHANNEL);
>
> fc = devm_kcalloc(ph->dev, PERF_FC_MAX, sizeof(*fc), GFP_KERNEL);
> if (!fc)
regards,
dan carpenter
© 2016 - 2025 Red Hat, Inc.