l3cfg: fix handling "instance-reset" ACD event

The ACD state handling is unfortunately very complicated. That is, because we obviously need to track state about how ACD is going (the acd_data, and in particular NML3AcdAddrState). Then there are various things that can happen, which are the AcdStateChangeMode enums. All these state-changes come together in one function: _l3_acd_data_state_change(), which is therefore complicated (I don't think that it would become simpler by spreading this code out to different functions, on the contrary). Anyway. So, what happens when we need to reset the n-acd instance? For example, because the MAC address of the link changed or some error. I guess, we need to restart probing. Previously, I think this was not handled properly. We already tried to fix this several times, the last was commit b3316063862b ('l3cfg: on n-acd instance-reset clear also ready ACD state'). There is still an issue ([1]). The bug [1] is, that we are in state NM_L3_ACD_ADDR_STATE_READY, during ACD_STATE_CHANGE_MODE_TIMEOUT event. That leads to an assertion failure. #5 0x00007f23be74698f in g_assertion_message_expr (domain=0x5629aca70359 "nm", file=0x5629aca62aab "src/core/nm-l3cfg.c", line=2395, func=0x5629acb26b30 <__func__.72.lto_priv.4> "_l3_acd_data_state_change", expr=<optimized out>) at ../glib/gtestutils.c:3091 #6 0x00005629ac937e46 in _l3_acd_data_state_change (self=0x5629add69790, acd_data=0x5629add8d520, state_change_mode=ACD_STATE_CHANGE_MODE_TIMEOUT, sender_addr=0x0, p_now_msec=0x7ffded506460) at src/core/nm-l3cfg.c:2395 #7 0x00005629ac939f4d in _l3_acd_data_timeout_cb (user_data=user_data@entry=0x5629add8d520) at src/core/nm-l3cfg.c:1933 #8 0x00007f23be71c5a1 in g_timeout_dispatch (source=0x5629addd7a80, callback=0x5629ac939ee0 <_l3_acd_data_timeout_cb>, user_data=0x5629add8d520) at ../glib/gmain.c:4889 #9 0x00007f23be71bd4f in g_main_dispatch (context=0x5629adc6da00) at ../glib/gmain.c:3337 #10 g_main_context_dispatch (context=0x5629adc6da00) at ../glib/gmain.c:4055 That can only happen, (I think) when we scheduled the timeout during an earlier ACD_STATE_CHANGE_MODE_INSTANCE_RESET event. Meaning, we need to handle instance-reset better. Instead, during instance-reset, switch always back to state PROBING, and let the timeout figure it out. [1] https://bugzilla.redhat.com/show_bug.cgi?id=2047788
author: Thomas Haller <thaller@redhat.com> 2022-01-28 17:52:34 +0100
committer: Thomas Haller <thaller@redhat.com> 2022-02-02 11:08:31 +0100
commit: db0d84f13a4c5a73403474f8b4af4b905fa60d4b (patch)
tree: 4d8530abd258e73db742585d26a1c7b1ee8426ee
parent: d7a951b947c84c94f064b763408f1bc81da9d2fd (diff)
1 files changed, 38 insertions, 49 deletions
diff --git a/src/core/nm-l3cfg.c b/src/core/nm-l3cfg.c
index 1ac477275d..aed4cf05c9 100644
--- a/src/core/nm-l3cfg.c
+++ b/src/core/nm-l3cfg.c
@@ -2351,22 +2351,7 @@ handle_init:
                 /* we are already probing. There is nothing to do for this timeout. */
                 return;
             }
-
-            nm_utils_get_monotonic_timestamp_msec_cached(p_now_msec);
-
-            if (acd_data->probing_timestamp_msec + ACD_WAIT_PROBING_EXTRA_TIME_MSEC
-                    + ACD_WAIT_PROBING_EXTRA_TIME2_MSEC
-                >= (*p_now_msec)) {
-                /* hm. We failed to create a new probe too long. Something is really wrong
-                 * internally, but let's ignore the issue and assume the address is good. What
-                 * else would we do? Assume the address is USED? */
-                _LOGT_acd(acd_data,
-                          "probe-good (waiting for creating probe timed out. Assume good)");
-                goto handle_start_defending;
-            }
-
-            log_reason = "retry probing on timeout";
-            goto handle_start_probing;
+            /* fall-through */
 
         case NM_L3_ACD_ADDR_STATE_USED:
         case NM_L3_ACD_ADDR_STATE_CONFLICT:
@@ -2386,7 +2371,10 @@ handle_init:
             acd_data->acd_defend_type_desired = acd_defend_type;
 
             if (acd_timeout_msec <= 0) {
-                log_reason = "acd disabled by configuration (restart after previous conflict)";
+                if (acd_data->info.state == NM_L3_ACD_ADDR_STATE_PROBING)
+                    log_reason = "acd disabled by configuration (timeout during probing)";
+                else
+                    log_reason = "acd disabled by configuration (restart after previous conflict)";
                 goto handle_probing_done;
             }
 
@@ -2396,6 +2384,23 @@ handle_init:
             }
 
             nm_utils_get_monotonic_timestamp_msec_cached(p_now_msec);
+
+            if (acd_data->info.state == NM_L3_ACD_ADDR_STATE_PROBING) {
+                if (acd_data->probing_timestamp_msec + ACD_WAIT_PROBING_EXTRA_TIME_MSEC
+                        + ACD_WAIT_PROBING_EXTRA_TIME2_MSEC
+                    >= (*p_now_msec)) {
+                    /* hm. We failed to create a new probe too long. Something is really wrong
+                     * internally, but let's ignore the issue and assume the address is good. What
+                     * else would we do? Assume the address is USED? */
+                    _LOGT_acd(acd_data,
+                              "probe-good (waiting for creating probe timed out. Assume good)");
+                    goto handle_start_defending;
+                }
+
+                log_reason = "retry probing on timeout";
+                goto handle_start_probing;
+            }
+
             acd_data->probing_timestamp_msec = (*p_now_msec);
             acd_data->probing_timeout_msec   = acd_timeout_msec;
             if (acd_data->info.state == NM_L3_ACD_ADDR_STATE_USED)
@@ -2616,39 +2621,23 @@ handle_init:
         return;
 
     case ACD_STATE_CHANGE_MODE_INSTANCE_RESET:
+        nm_assert(NM_IN_SET(acd_data->info.state,
+                            NM_L3_ACD_ADDR_STATE_PROBING,
+                            NM_L3_ACD_ADDR_STATE_DEFENDING,
+                            NM_L3_ACD_ADDR_STATE_READY,
+                            NM_L3_ACD_ADDR_STATE_USED,
+                            NM_L3_ACD_ADDR_STATE_CONFLICT,
+                            NM_L3_ACD_ADDR_STATE_EXTERNAL_REMOVED));
 
-        switch (acd_data->info.state) {
-        case NM_L3_ACD_ADDR_STATE_INIT:
-            nm_assert_not_reached();
-            return;
-        case NM_L3_ACD_ADDR_STATE_PROBING:
-        case NM_L3_ACD_ADDR_STATE_DEFENDING:
-        case NM_L3_ACD_ADDR_STATE_READY:
-            if (!acd_data->nacd_probe) {
-                /* we failed starting to probe before and have a timer running to
-                 * restart. We don't do anything now, but let the timer handle it.
-                 * This also implements some rate limiting for us. */
-                _LOGT_acd(acd_data,
-                          "n-acd instance reset. Ignore event while in state %s",
-                          _l3_acd_addr_state_to_string(acd_data->info.state));
-                return;
-            }
-
-            _LOGT_acd(acd_data,
-                      "n-acd instance reset. Trigger a restart while in state %s",
-                      _l3_acd_addr_state_to_string(acd_data->info.state));
-            acd_data->nacd_probe = n_acd_probe_free(acd_data->nacd_probe);
-            _l3_acd_data_timeout_schedule(acd_data, 0);
-            return;
-        case NM_L3_ACD_ADDR_STATE_USED:
-        case NM_L3_ACD_ADDR_STATE_CONFLICT:
-        case NM_L3_ACD_ADDR_STATE_EXTERNAL_REMOVED:
-            /* as we have no probe, we are already handling this (e.g. by having
-             * a retry timer). Nothing to do. */
-            nm_assert(!acd_data->nacd_probe);
-            return;
-        }
-        nm_assert_not_reached();
+        /* an instance-reset is a dramatic event. We start over with probing. */
+        _LOGT_acd(acd_data,
+                  "n-acd instance reset. Reset to probing while in state %s",
+                  _l3_acd_addr_state_to_string(acd_data->info.state));
+        acd_data->nacd_probe                         = n_acd_probe_free(acd_data->nacd_probe);
+        acd_data->last_defendconflict_timestamp_msec = 0;
+        acd_data->probing_timestamp_msec = nm_utils_get_monotonic_timestamp_msec_cached(p_now_msec);
+        _l3_acd_data_state_set(self, acd_data, NM_L3_ACD_ADDR_STATE_PROBING, FALSE);
+        _l3_acd_data_timeout_schedule(acd_data, 0);
         return;
     }
author	Thomas Haller <thaller@redhat.com>	2022-01-28 17:52:34 +0100
committer	Thomas Haller <thaller@redhat.com>	2022-02-02 11:08:31 +0100
commit	db0d84f13a4c5a73403474f8b4af4b905fa60d4b (patch)
tree	4d8530abd258e73db742585d26a1c7b1ee8426ee
parent	d7a951b947c84c94f064b763408f1bc81da9d2fd (diff)