diff options
author | Thomas Haller <thaller@redhat.com> | 2022-01-28 17:52:34 +0100 |
---|---|---|
committer | Thomas Haller <thaller@redhat.com> | 2022-02-02 11:08:31 +0100 |
commit | db0d84f13a4c5a73403474f8b4af4b905fa60d4b (patch) | |
tree | 4d8530abd258e73db742585d26a1c7b1ee8426ee | |
parent | d7a951b947c84c94f064b763408f1bc81da9d2fd (diff) |
l3cfg: fix handling "instance-reset" ACD event
The ACD state handling is unfortunately very complicated. That is, because
we obviously need to track state about how ACD is going (the acd_data, and
in particular NML3AcdAddrState). Then there are various things that can
happen, which are the AcdStateChangeMode enums. All these state-changes
come together in one function: _l3_acd_data_state_change(), which is
therefore complicated (I don't think that it would become simpler by
spreading this code out to different functions, on the contrary).
Anyway.
So, what happens when we need to reset the n-acd instance? For example,
because the MAC address of the link changed or some error. I guess, we
need to restart probing.
Previously, I think this was not handled properly. We already tried to
fix this several times, the last was commit b3316063862b ('l3cfg: on
n-acd instance-reset clear also ready ACD state'). There is still an
issue ([1]).
The bug [1] is, that we are in state NM_L3_ACD_ADDR_STATE_READY, during
ACD_STATE_CHANGE_MODE_TIMEOUT event. That leads to an assertion
failure.
#5 0x00007f23be74698f in g_assertion_message_expr (domain=0x5629aca70359 "nm", file=0x5629aca62aab "src/core/nm-l3cfg.c", line=2395, func=0x5629acb26b30 <__func__.72.lto_priv.4> "_l3_acd_data_state_change", expr=<optimized out>) at ../glib/gtestutils.c:3091
#6 0x00005629ac937e46 in _l3_acd_data_state_change (self=0x5629add69790, acd_data=0x5629add8d520, state_change_mode=ACD_STATE_CHANGE_MODE_TIMEOUT, sender_addr=0x0, p_now_msec=0x7ffded506460) at src/core/nm-l3cfg.c:2395
#7 0x00005629ac939f4d in _l3_acd_data_timeout_cb (user_data=user_data@entry=0x5629add8d520) at src/core/nm-l3cfg.c:1933
#8 0x00007f23be71c5a1 in g_timeout_dispatch (source=0x5629addd7a80, callback=0x5629ac939ee0 <_l3_acd_data_timeout_cb>, user_data=0x5629add8d520) at ../glib/gmain.c:4889
#9 0x00007f23be71bd4f in g_main_dispatch (context=0x5629adc6da00) at ../glib/gmain.c:3337
#10 g_main_context_dispatch (context=0x5629adc6da00) at ../glib/gmain.c:4055
That can only happen, (I think) when we scheduled the timeout
during an earlier ACD_STATE_CHANGE_MODE_INSTANCE_RESET event. Meaning,
we need to handle instance-reset better.
Instead, during instance-reset, switch always back to state PROBING, and
let the timeout figure it out.
[1] https://bugzilla.redhat.com/show_bug.cgi?id=2047788
-rw-r--r-- | src/core/nm-l3cfg.c | 87 |
1 files changed, 38 insertions, 49 deletions
diff --git a/src/core/nm-l3cfg.c b/src/core/nm-l3cfg.c index 1ac477275d..aed4cf05c9 100644 --- a/src/core/nm-l3cfg.c +++ b/src/core/nm-l3cfg.c @@ -2351,22 +2351,7 @@ handle_init: /* we are already probing. There is nothing to do for this timeout. */ return; } - - nm_utils_get_monotonic_timestamp_msec_cached(p_now_msec); - - if (acd_data->probing_timestamp_msec + ACD_WAIT_PROBING_EXTRA_TIME_MSEC - + ACD_WAIT_PROBING_EXTRA_TIME2_MSEC - >= (*p_now_msec)) { - /* hm. We failed to create a new probe too long. Something is really wrong - * internally, but let's ignore the issue and assume the address is good. What - * else would we do? Assume the address is USED? */ - _LOGT_acd(acd_data, - "probe-good (waiting for creating probe timed out. Assume good)"); - goto handle_start_defending; - } - - log_reason = "retry probing on timeout"; - goto handle_start_probing; + /* fall-through */ case NM_L3_ACD_ADDR_STATE_USED: case NM_L3_ACD_ADDR_STATE_CONFLICT: @@ -2386,7 +2371,10 @@ handle_init: acd_data->acd_defend_type_desired = acd_defend_type; if (acd_timeout_msec <= 0) { - log_reason = "acd disabled by configuration (restart after previous conflict)"; + if (acd_data->info.state == NM_L3_ACD_ADDR_STATE_PROBING) + log_reason = "acd disabled by configuration (timeout during probing)"; + else + log_reason = "acd disabled by configuration (restart after previous conflict)"; goto handle_probing_done; } @@ -2396,6 +2384,23 @@ handle_init: } nm_utils_get_monotonic_timestamp_msec_cached(p_now_msec); + + if (acd_data->info.state == NM_L3_ACD_ADDR_STATE_PROBING) { + if (acd_data->probing_timestamp_msec + ACD_WAIT_PROBING_EXTRA_TIME_MSEC + + ACD_WAIT_PROBING_EXTRA_TIME2_MSEC + >= (*p_now_msec)) { + /* hm. We failed to create a new probe too long. Something is really wrong + * internally, but let's ignore the issue and assume the address is good. What + * else would we do? Assume the address is USED? */ + _LOGT_acd(acd_data, + "probe-good (waiting for creating probe timed out. Assume good)"); + goto handle_start_defending; + } + + log_reason = "retry probing on timeout"; + goto handle_start_probing; + } + acd_data->probing_timestamp_msec = (*p_now_msec); acd_data->probing_timeout_msec = acd_timeout_msec; if (acd_data->info.state == NM_L3_ACD_ADDR_STATE_USED) @@ -2616,39 +2621,23 @@ handle_init: return; case ACD_STATE_CHANGE_MODE_INSTANCE_RESET: + nm_assert(NM_IN_SET(acd_data->info.state, + NM_L3_ACD_ADDR_STATE_PROBING, + NM_L3_ACD_ADDR_STATE_DEFENDING, + NM_L3_ACD_ADDR_STATE_READY, + NM_L3_ACD_ADDR_STATE_USED, + NM_L3_ACD_ADDR_STATE_CONFLICT, + NM_L3_ACD_ADDR_STATE_EXTERNAL_REMOVED)); - switch (acd_data->info.state) { - case NM_L3_ACD_ADDR_STATE_INIT: - nm_assert_not_reached(); - return; - case NM_L3_ACD_ADDR_STATE_PROBING: - case NM_L3_ACD_ADDR_STATE_DEFENDING: - case NM_L3_ACD_ADDR_STATE_READY: - if (!acd_data->nacd_probe) { - /* we failed starting to probe before and have a timer running to - * restart. We don't do anything now, but let the timer handle it. - * This also implements some rate limiting for us. */ - _LOGT_acd(acd_data, - "n-acd instance reset. Ignore event while in state %s", - _l3_acd_addr_state_to_string(acd_data->info.state)); - return; - } - - _LOGT_acd(acd_data, - "n-acd instance reset. Trigger a restart while in state %s", - _l3_acd_addr_state_to_string(acd_data->info.state)); - acd_data->nacd_probe = n_acd_probe_free(acd_data->nacd_probe); - _l3_acd_data_timeout_schedule(acd_data, 0); - return; - case NM_L3_ACD_ADDR_STATE_USED: - case NM_L3_ACD_ADDR_STATE_CONFLICT: - case NM_L3_ACD_ADDR_STATE_EXTERNAL_REMOVED: - /* as we have no probe, we are already handling this (e.g. by having - * a retry timer). Nothing to do. */ - nm_assert(!acd_data->nacd_probe); - return; - } - nm_assert_not_reached(); + /* an instance-reset is a dramatic event. We start over with probing. */ + _LOGT_acd(acd_data, + "n-acd instance reset. Reset to probing while in state %s", + _l3_acd_addr_state_to_string(acd_data->info.state)); + acd_data->nacd_probe = n_acd_probe_free(acd_data->nacd_probe); + acd_data->last_defendconflict_timestamp_msec = 0; + acd_data->probing_timestamp_msec = nm_utils_get_monotonic_timestamp_msec_cached(p_now_msec); + _l3_acd_data_state_set(self, acd_data, NM_L3_ACD_ADDR_STATE_PROBING, FALSE); + _l3_acd_data_timeout_schedule(acd_data, 0); return; } |