Home | History | Annotate | Line # | Download | only in ServiceRegistration
      1 /* srp-replication.c
      2  *
      3  * Copyright (c) 2020-2023 Apple Inc. All rights reserved.
      4  *
      5  * Licensed under the Apache License, Version 2.0 (the "License");
      6  * you may not use this file except in compliance with the License.
      7  * You may obtain a copy of the License at
      8  *
      9  *     https://www.apache.org/licenses/LICENSE-2.0
     10  *
     11  * Unless required by applicable law or agreed to in writing, software
     12  * distributed under the License is distributed on an "AS IS" BASIS,
     13  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
     14  * See the License for the specific language governing permissions and
     15  * limitations under the License.
     16  *
     17  * This file contains an implementation of SRP Replication, which allows two or more
     18  * SRP servers to cooperatively maintain an SRP registration dataset.
     19  */
     20 
     21 #include <stdlib.h>
     22 #include <string.h>
     23 #include <stdio.h>
     24 #include <unistd.h>
     25 #include <pwd.h>
     26 #include <errno.h>
     27 #include <sys/socket.h>
     28 #include <netinet/in.h>
     29 #include <arpa/inet.h>
     30 #include <fcntl.h>
     31 #include <time.h>
     32 #include <dns_sd.h>
     33 #include <net/if.h>
     34 #include <inttypes.h>
     35 #include <sys/resource.h>
     36 #include <math.h>
     37 #include <CoreUtils/CoreUtils.h>
     38 
     39 #include "srp.h"
     40 #include "dns-msg.h"
     41 #include "srp-crypto.h"
     42 #include "ioloop.h"
     43 #include "srp-gw.h"
     44 #include "srp-proxy.h"
     45 #include "srp-mdns-proxy.h"
     46 #include "dnssd-proxy.h"
     47 #include "config-parse.h"
     48 #include "cti-services.h"
     49 #include "route.h"
     50 #define DNSMessageHeader dns_wire_t
     51 #include "dso.h"
     52 #include "dso-utils.h"
     53 
     54 #if SRP_FEATURE_REPLICATION
     55 #include "srp-replication.h"
     56 #ifdef SRP_TEST_SERVER
     57 #include "test-packet.h"
     58 #include "test-srpl.h"
     59 #endif
     60 
     61 
// True when the connection's state value is ordered after srpl_state_connecting.
// This relies on the declaration order of the srpl_state_t enumerators (declared elsewhere).
#define SRPL_CONNECTION_IS_CONNECTED(connection) ((connection)->state > srpl_state_connecting)

// Convenience wrapper that passes the caller's file and line to srpl_event_content_type_set_().
#define srpl_event_content_type_set(event, content_type) \
    srpl_event_content_type_set_(event, content_type, __FILE__, __LINE__)
static bool srpl_event_content_type_set_(srpl_event_t *event,
                                         srpl_event_content_type_t content_type, const char *file, int line);

// Forward declarations for static functions that are used before they are defined in this file.
static srpl_state_t srpl_connection_drop_state_delay(srpl_instance_t *instance,
                                                     srpl_connection_t *srpl_connection, int delay);
static srpl_state_t srpl_connection_drop_state(srpl_instance_t *instance, srpl_connection_t *srpl_connection);
static void srpl_disconnect(srpl_connection_t *srpl_connection);
static void srpl_connection_discontinue(srpl_connection_t *srpl_connection);
static void srpl_event_initialize(srpl_event_t *event, srpl_event_type_t event_type);
static void srpl_event_deliver(srpl_connection_t *srpl_connection, srpl_event_t *event);
static void srpl_domain_advertise(srpl_domain_t *domain);
static void srpl_connection_finalize(srpl_connection_t *srpl_connection);
static void srpl_instance_address_query_reset(srpl_instance_t *instance);
static void srpl_instance_reconnect(srpl_instance_t *instance);
static void srpl_instance_reconnect_callback(void *context);
static bool srpl_domain_browse_start(srpl_domain_t *domain);
static const char *srpl_state_name(srpl_state_t state);
static bool srpl_can_transition_to_routine_state(srpl_domain_t *domain);
static void srpl_transition_to_routine_state(srpl_domain_t *domain);
static void srpl_message_sent(srpl_connection_t *srpl_connection);
static srpl_state_t srpl_connection_schedule_reconnect_event(srpl_connection_t *srpl_connection, uint32_t when);
static void srpl_partner_discovery_timeout(void *context);
static void srpl_instance_services_discontinue(srpl_instance_t *instance);
static void srpl_instance_service_discontinue_timeout(void *context);
static void srpl_maybe_sync_or_transition(srpl_domain_t *domain);
static int srpl_dataset_id_compare(uint64_t id1, uint64_t id2);
static void srpl_state_transition_by_dataset_id(srpl_domain_t *domain, srpl_instance_t *instance);
     92 
// A 64-bit value with only the most significant bit set, cast to int64_t.
// NOTE(review): presumably used when comparing dataset IDs (see srpl_dataset_id_compare) -- confirm.
// NOTE(review): the expansion is not wrapped in parentheses; take care when using it inside larger expressions.
#define EQUI_DISTANCE64 (int64_t)0x8000000000000000
#define MAX_ADDITIONAL_HOST_MESSAGES 32
#define SRPL_STATE_TIMEOUT (30 * 1000) // state timeout in milliseconds
// Time unit conversions, expressed in seconds.
#define SECONDS_IN_MINUTE  60
#define SECONDS_IN_HOUR    (60 * 60)
#define SECONDS_IN_DAY     (24 * SECONDS_IN_HOUR)

// In DEBUG builds STATE_DEBUGGING_ABORT() calls abort(); otherwise it expands to nothing.
// Note that the DEBUG expansion already ends in a semicolon.
#ifdef DEBUG
#define STATE_DEBUGGING_ABORT() abort();
#else
#define STATE_DEBUGGING_ABORT()
#endif
    105 
    106 // Send reconfirm records for all queries relating to this connection.
    107 static void
    108 srpl_reconfirm(srpl_connection_t *connection)
    109 {
    110     // If there's no instance, that's why we got here, so no need for reconfirms.
    111     if (connection->instance == NULL) {
    112         INFO("no instance");
    113         return;
    114     }
    115     srpl_instance_t *instance = connection->instance;
    116 
    117     for (srpl_instance_service_t *service = instance->services; service != NULL; service = service->next) {
    118         if (!service->got_new_info) {
    119             INFO("we haven't had any new information since the last time we did a reconfirm, so no point doing it again.");
    120             continue;
    121         }
    122         service->got_new_info = false;
    123 
    124         if (service->full_service_name != NULL && service->ptr_rdata != NULL) {
    125             // The service name is the service instance name minus the first label.
    126             char *service_type = strchr(service->full_service_name, '.');
    127             if (service_type != NULL) {
    128                 service_type++; // Skip the '.'
    129                 // Send a reconfirm for the PTR record
    130                 DNSServiceReconfirmRecord(0, service->ifindex, service_type, dns_rrtype_ptr, dns_qclass_in,
    131                                           service->ptr_length, service->ptr_rdata);
    132             }
    133             if (service->srv_rdata != NULL) {
    134                 DNSServiceReconfirmRecord(0, service->ifindex, service->full_service_name, dns_rrtype_srv, dns_qclass_in,
    135                                           service->srv_length, service->srv_rdata);
    136             }
    137             if (service->txt_rdata != NULL) {
    138                 DNSServiceReconfirmRecord(0, service->ifindex, service->full_service_name, dns_rrtype_txt, dns_qclass_in,
    139                                           service->txt_length, service->txt_rdata);
    140             }
    141             if (service->address_query != NULL) {
    142                 address_query_t *query = service->address_query;
    143                 for (int i = 0; i > query->num_addresses; i++) {
    144                     if (query->addresses[i].sa.sa_family == AF_INET) {
    145                         DNSServiceReconfirmRecord(0, query->address_interface[i], query->hostname, dns_rrtype_a,
    146                                                   dns_qclass_in, 4, &query->addresses[i].sin.sin_addr);
    147                     } else if (query->addresses[i].sa.sa_family == AF_INET6) {
    148                         DNSServiceReconfirmRecord(0, query->address_interface[i], query->hostname, dns_rrtype_aaaa,
    149                                                   dns_qclass_in, 16, &query->addresses[i].sin6.sin6_addr);
    150                     }
    151                 }
    152             }
    153         }
    154     }
    155 }
    156 
    157 //
    158 // 1. Enumerate all SRP servers that are participating in synchronization on infrastructure: This is done by looking up
    159 //    NS records for <thread-network-name>.thread.home.arpa with the ForceMulticast flag set so that we use mDNS to
    160 //    discover them.
    161 
    162 // 2. For each identified server that is not this server, look up A and AAAA records for the server's hostname.
    163 
    164 // 3. Maintain a state object with the list of IP addresses and an index to the current server being tried.
    165 
    166 // 4. Try to connect to the first address on the list -> connection management state machine:
    167 
    168 //   * When we have established an outgoing connection to a server, generate a random 64-bit unsigned number and send a
    169 //     SRPLSession DSO message using that number as the server ID.
    170 
    171 //   * When we receive an SRPLSession DSO message, see if we have an outgoing connection from the same server
    172 //     for which we've sent a server ID. If so, and if the server id we received is less than the one we sent,
    173 //     terminate the outgoing connection. If the server ids are equal, generate a new server id for the outgoing
    174 //     connection and send another session establishment message.
    175 //
    176 //   * When we receive an acknowledgement to our SRPLSession DSO message, see if we have an incoming
    177 //     connection from the same server from which we've received a server id. If the server id we received is less
    178 //     than the one we sent, terminate the outgoing connection. If the server ids are equal, generate a new random
    179 //     number for the outgoing connection and send another session establishment message.
    180 //
    181 //   * When a connection from a server is terminated, see if we have an established outgoing connection with that
    182 //     server.  If not, attempt to connect to the next address we have for that server.
    183 //
    184 //   * When our connection to a server is terminated or fails, and there is no established incoming connection from that
    185 //     server, attempt to connect to the next address we have for that server.
    186 //
    187 //   * When the NS record for a server goes away, drop any outgoing connection to that server and discontinue trying to
    188 //     connect to it.
    189 //
    190 // 5. When we have established a session, meaning that we either got an acknowledgment to a SRPLSession DSO
    191 //    message that we sent and _didn't_ drop the connection, or we got an SRPLSession DSO message on an
    192 //    incoming connection with a lower server id than the outgoing connection, we begin the synchronization process.
    193 //
    194 //    * Immediately following session establishment, we generate a list of candidate hosts to send to the other server
    195 //      from our internal list of SRP hosts (clients). Every non-expired host entry goes into the candidate list.
    196 //
//    * Then, if we are the originator, we send an SRPLSendCandidates message.
    198 //
    199 //    * If we are the recipient, we wait for an SRPLSendCandidates message.
    200 //
    201 //    * When we receive an SRPLSendCandidates message, we iterate across the candidates list, for each
    202 //      candidate sending an SRPLCandidate message containing the host key, current time, and last message
    203 //      received times, in seconds since the epoch. When we come to the end of the candidates list, we send an
    204 //      acknowledgement to the SRPLSendCandidates message and discard the candidates list.
    205 //
    206 //    * When we receive an SRPLCandidate message, we look in our own candidate list, if there is one, to see
    207 //      if the host key is present in the candidates list. If it is not present, or if it is present and the received
    208 //      time from the SRPLCandidate message is later than the time we have recorded in our own candidate
    209 //      list, we send an SRPLCandidateRequest message with the host key from the SRPLCandidate
    210 //      message.
    211 //
    212 //    * When we receive an SRPLCandidateRequest message, we send an SRPLHost message which
    213 //      encapsulates the SRP update for the host and includes the timestamp when we received the SRP update from that
    214 //      host, which may have changed since we sent the SRPLCandidate message.
    215 //
    216 //    * When we receive an SRPLHost message, we look in our list of hosts (SRP clients) for a matching
    217 //      host. If no such host exists, or if it exists and the timestamp is less than the timestamp in the
    218 //      SRPLHost message, we process the SRP update from the SRPLHost message and mark the host as
    219 //      "not received locally."  In other words, this message was not received directly from an SRP client, but rather
    220 //      indirectly through our SRP replication partner. Note that this message cannot be assumed to be syntactically
    221 //      correct and must be treated like any other data received from the network. If we are sent an invalid message,
    222 //      this is an indication that our partner is broken in some way, since it should have validated the message before
    223 //      accepting it.
    224 //
    225 //    * Whenever the SRP engine applies an SRP update from a host, it also delivers that update to each replication
    226 //      server state engine.
    227 //
    228 //      * That replication server state engine first checks to see if it is connected to its partner; if not, no action
    229 //        is taken.
    230 //
    231 //      * It then checks to see if there is a candidate list.
    232 //        * If so, it checks to see if the host implicated in the update is already on the candidate list.
    233 //          * If so, it updates that candidate's update time.
    234 //          * If not, it adds the host to the end of the candidate list.
    235 //        * If not, it sends an SRPLCandidate message to the other replication server, with the host
    236 //          key and new timestamp.
    237 //
    238 // 6. When there is more than one SRP server participating in replication, only one server should advertise using
    239 //    mDNS. All other servers should only advertise using DNS and DNS Push (SRP scalability feature). The SRP server
    240 //    with the lowest numbered server ID is the one that acts as an advertising proxy for SRP. In practice this means
    241 //    that if we have the lowest server ID of all the SRP servers we are connected to, we advertise mDNS. If two servers
    242 //    on the same link can't connect to each other, they probably can't see each others' multicasts, so this is the
    243 //    right outcome.
    244 
    245 static bool
    246 ip_addresses_equal(const addr_t *a, const addr_t *b)
    247 {
    248     return (a->sa.sa_family == b->sa.sa_family &&
    249             ((a->sa.sa_family == AF_INET && !memcmp(&a->sin.sin_addr, &b->sin.sin_addr, 4)) ||
    250              (a->sa.sa_family == AF_INET6 && !memcmp(&a->sin6.sin6_addr, &b->sin6.sin6_addr, 16))));
    251 }
    252 
// Log a message about an address using the given logging macro (log_type, e.g. INFO or ERROR).
// The pieces are emitted in this order: preamble, the address (formatted as IPv6 when the family
// is AF_INET6, otherwise as IPv4), conjunction, fullname, number, interfaceIndex. Despite its
// name, "number" is a string label that is printed immediately before interfaceIndex (callers in
// this file pass " index ").
// The identifier rdata_buf is handed to the *_GEN_SRP and *_PARAM_SRP macros and is not declared
// here -- presumably the *_GEN_SRP macros declare it; confirm against the logging header.
// NOTE(review): this expands to a bare if/else statement rather than do { } while (0), so it
// should only be used as a complete statement inside braces.
#define ADDR_NAME_LOGGER(log_type, address, preamble, conjunction, number, fullname, interfaceIndex)            \
    if ((address)->sa.sa_family == AF_INET6) {                                                                  \
        SEGMENTED_IPv6_ADDR_GEN_SRP(&(address)->sin6.sin6_addr, rdata_buf);                                     \
        log_type(PUB_S_SRP PRI_SEGMENTED_IPv6_ADDR_SRP PUB_S_SRP PRI_S_SRP PUB_S_SRP "%d", preamble,            \
                 SEGMENTED_IPv6_ADDR_PARAM_SRP(&(address)->sin6.sin6_addr, rdata_buf),                          \
                 conjunction, fullname, number, interfaceIndex);                                                \
    } else {                                                                                                    \
        IPv4_ADDR_GEN_SRP(&(address)->sin.sin_addr, rdata_buf);                                                 \
        log_type(PUB_S_SRP PRI_IPv4_ADDR_SRP PUB_S_SRP PRI_S_SRP PUB_S_SRP "%d", preamble,                      \
                 IPv4_ADDR_PARAM_SRP(&(address)->sin.sin_addr, rdata_buf),                                      \
                 conjunction, fullname, number, interfaceIndex);                                                \
    }
    265 
// DNSServiceQueryRecord callback for the A and AAAA queries started by address_query_create().
// Maintains the address list held in the address_query_t passed as context:
//   - on error, reports the error through change_callback and returns;
//   - records that are not class IN, or not A with 4 bytes / AAAA with 16 bytes of rdata, are
//     logged and ignored;
//   - link-local addresses are logged and ignored;
//   - an add for an (address, interface) pair already in the list is ignored;
//   - an add appends the address (if there is room) and calls change_callback with added == true;
//   - a remove compacts the list and calls change_callback with added == false.
static void
address_query_callback(DNSServiceRef UNUSED sdRef, DNSServiceFlags flags, uint32_t interfaceIndex,
                       DNSServiceErrorType errorCode, const char *fullname, uint16_t rrtype, uint16_t rrclass,
                       uint16_t rdlen, const void *rdata, uint32_t UNUSED ttl, void *context)
{
    address_query_t *address = context;
    addr_t addr;
    int i, j;

    if (errorCode != kDNSServiceErr_NoError) {
        ERROR("address resolution for " PRI_S_SRP " failed with %d", fullname, errorCode);
        address->change_callback(address->context, NULL, false, flags & kDNSServiceFlagsMoreComing, errorCode);
        return;
    }
    // Accept only IN-class A records with 4 bytes of rdata or AAAA records with 16 bytes of rdata.
    if (rrclass != dns_qclass_in || ((rrtype != dns_rrtype_a || rdlen != 4) &&
                                     (rrtype != dns_rrtype_aaaa || rdlen != 16))) {
        ERROR("Invalid response record type (%d) or class (%d) provided for " PRI_S_SRP, rrtype, rrclass, fullname);
        return;
    }

    // Build a zeroed addr_t so that the whole-structure memcmp() below compares only fields set here.
    memset(&addr, 0, sizeof(addr));
    if (rrtype == dns_rrtype_a) {
#ifndef NOT_HAVE_SA_LEN
        addr.sa.sa_len = sizeof(struct sockaddr_in);
#endif
        addr.sa.sa_family = AF_INET;
        memcpy(&addr.sin.sin_addr, rdata, rdlen);
        // NOTE(review): s_addr holds the rdata bytes in network order. On BSD-derived systems
        // IN_LINKLOCAL() tests a host-byte-order value, in which case this check would need
        // ntohl() -- confirm against the definition of IN_LINKLOCAL used by this build.
        if (IN_LINKLOCAL(addr.sin.sin_addr.s_addr)) {
            ADDR_NAME_LOGGER(INFO, &addr, "Skipping link-local address ", " received for instance ", " index ",
                             fullname, interfaceIndex);
            return;
        }
    } else {
#ifndef NOT_HAVE_SA_LEN
        addr.sa.sa_len = sizeof(struct sockaddr_in6);
#endif
        addr.sa.sa_family = AF_INET6;
        memcpy(&addr.sin6.sin6_addr, rdata, rdlen);
        if (IN6_IS_ADDR_LINKLOCAL(&addr.sin6.sin6_addr)) {
            ADDR_NAME_LOGGER(INFO, &addr, "Skipping link-local address ", " received for instance ", " index ",
                             fullname, interfaceIndex);
            return;
        }
    }

    // Scan the list with a read index (i) and a write index (j). Entries that do not match are
    // kept (copied down to j when a gap has opened); a matching entry on a remove is dropped by
    // not advancing j. After the loop, i == j means no entry was dropped.
    for (i = 0, j = 0; i < address->num_addresses; i++) {
        // Already in the list?
        if (address->address_interface[i] == interfaceIndex && !memcmp(&address->addresses[i], &addr, sizeof(addr))) {
            if (flags & kDNSServiceFlagsAdd) {
                ADDR_NAME_LOGGER(INFO, &addr, "Duplicate address ", " received for instance ", " index ",
                                 fullname, interfaceIndex);
                return;
            } else {
                ADDR_NAME_LOGGER(INFO, &addr, "Removing address ", " from instance ", " index ",
                                            fullname, interfaceIndex);

                // If we're removing an address, we keep going through the array copying down.
                // cur_address is shifted down when the removed slot is at or before it.
                if (address->cur_address >= i) {
                    address->cur_address--;
                }
            }
        } else {
            // Copy down.
            if (i != j) {
                address->addresses[j] = address->addresses[i];
                address->address_interface[j] = address->address_interface[i];
            }
            j++;
        }
    }
    if (flags & kDNSServiceFlagsAdd) {
        // On an add nothing was dropped above, so i is the current number of addresses.
        if (i == ADDRESS_QUERY_MAX_ADDRESSES) {
            ADDR_NAME_LOGGER(ERROR, &addr, "No room for address ", " received for ", " index ",
                                         fullname, interfaceIndex);
            return;
        }

        ADDR_NAME_LOGGER(INFO, &addr, "Adding address ", " to ", " index ", fullname, interfaceIndex);

        address->addresses[i] = addr;
        address->address_interface[i] = interfaceIndex;
        address->num_addresses++;
        address->change_callback(address->context, &address->addresses[i], true, flags & kDNSServiceFlagsMoreComing, kDNSServiceErr_NoError);
    } else {
        // On a remove, i == j means the scan found nothing to drop.
        if (i == j) {
            ADDR_NAME_LOGGER(ERROR, &addr, "Remove for unknown address ", " received for ", " index ",
                               fullname, interfaceIndex);
            return;
        } else {
            address->num_addresses--;
            // The removed entry is no longer in the array, so the local copy is passed instead.
            address->change_callback(address->context, &addr, false, flags & kDNSServiceFlagsMoreComing, kDNSServiceErr_NoError);
        }
    }
}
    360 
    361 static void
    362 address_query_finalize(void *context)
    363 {
    364     address_query_t *address = context;
    365     free(address->hostname);
    366     free(address);
    367 }
    368 
    369 static void
    370 address_query_cancel(address_query_t *address)
    371 {
    372     if (address->a_query != NULL) {
    373         ioloop_dnssd_txn_cancel(address->a_query);
    374         ioloop_dnssd_txn_release(address->a_query);
    375         address->a_query = NULL;
    376     }
    377     if (address->aaaa_query != NULL) {
    378         ioloop_dnssd_txn_cancel(address->aaaa_query);
    379         ioloop_dnssd_txn_release(address->aaaa_query);
    380         address->aaaa_query = NULL;
    381     }
    382 
    383     // Have whatever holds a reference to the address query let go of it.
    384     if (address->cancel_callback != NULL && address->context != NULL) {
    385         address->cancel_callback(address->context);
    386         address->context = NULL;
    387         address->cancel_callback = NULL;
    388     }
    389 }
    390 
    391 static void
    392 address_query_txn_fail(void *context, int err)
    393 {
    394     address_query_t *address = context;
    395     ERROR("address query " PRI_S_SRP " i/o failure: %d", address->hostname, err);
    396     address_query_cancel(address);
    397 }
    398 
// Context-release callback registered with ioloop_dnssd_txn_add() in address_query_create():
// drops one reference on the address query passed as context. It pairs with the RETAIN_HERE
// taken for each query transaction in address_query_create().
static void
address_query_context_release(void *context)
{
    address_query_t *address = context;
    RELEASE_HERE(address, address_query);
}
    405 
    406 static address_query_t *
    407 address_query_create(const char *hostname, void *context, address_change_callback_t change_callback,
    408                      address_query_cancel_callback_t cancel_callback)
    409 {
    410     address_query_t *address = calloc(1, sizeof(*address));
    411     DNSServiceRef sdref;
    412     dnssd_txn_t **txn;
    413 
    414     require_action_quiet(address != NULL, exit_no_free, ERROR("No memory for address query."));
    415     RETAIN_HERE(address, address_query); // We return a retained object, or free it.
    416     address->hostname = strdup(hostname);
    417     require_action_quiet(address->hostname != NULL, exit, ERROR("No memory for address query hostname."));
    418 
    419     for (int i = 0; i < 2; i++) {
    420         int ret = DNSServiceQueryRecord(&sdref, kDNSServiceFlagsForceMulticast | kDNSServiceFlagsLongLivedQuery,
    421                                         kDNSServiceInterfaceIndexAny, hostname, (i
    422                                                                                  ? kDNSServiceType_A
    423                                                                                  : kDNSServiceType_AAAA),
    424                                         kDNSServiceClass_IN, address_query_callback, address);
    425         require_action_quiet(ret == kDNSServiceErr_NoError, exit,
    426                              ERROR("Unable to resolve instance hostname " PRI_S_SRP " addresses: %d",
    427                                    hostname, ret));
    428 
    429         txn = i ? &address->a_query : &address->aaaa_query;
    430         *txn = ioloop_dnssd_txn_add(sdref, address, address_query_context_release, address_query_txn_fail);
    431         require_action_quiet(*txn != NULL, exit,
    432                              ERROR("Unable to set up ioloop transaction for " PRI_S_SRP " query on " THREAD_BROWSING_DOMAIN,
    433                                    hostname);
    434                              DNSServiceRefDeallocate(sdref));
    435         RETAIN_HERE(address, address_query); // For the QueryRecord context
    436     }
    437     address->change_callback = change_callback;
    438     address->cancel_callback = cancel_callback;
    439     address->context = context;
    440     address->cur_address = -1;
    441     return address;
    442 
    443 exit:
    444     if (address->a_query != NULL) {
    445         ioloop_dnssd_txn_cancel(address->a_query);
    446         ioloop_dnssd_txn_release(address->a_query);
    447         address->a_query = NULL;
    448     }
    449     if (address->aaaa_query != NULL) { // Un-possible right now, but better safe than sorry in case of future change
    450         ioloop_dnssd_txn_cancel(address->aaaa_query);
    451         ioloop_dnssd_txn_release(address->aaaa_query);
    452         address->aaaa_query = NULL;
    453     }
    454     RELEASE_HERE(address, address_query);
    455     address = NULL;
    456 
    457 exit_no_free:
    458     return address;
    459 }
    460 
    461 static void
    462 srpl_domain_finalize(srpl_domain_t *domain)
    463 {
    464     srpl_instance_t *instance, *next;
    465     srpl_instance_service_t *service, *next_service;
    466 
    467     free(domain->name);
    468     if (domain->query != NULL) {
    469         ioloop_dnssd_txn_cancel(domain->query);
    470         ioloop_dnssd_txn_release(domain->query);
    471     }
    472 
    473     for (instance = domain->instances; instance != NULL; instance = next) {
    474         next = instance->next;
    475         srpl_instance_services_discontinue(instance);
    476     }
    477     for (service = domain->unresolved_services; service != NULL; service = next_service) {
    478         next_service = service->next;
    479         srpl_instance_service_discontinue_timeout(service);
    480     }
    481     if (domain->query != NULL) {
    482         ioloop_dnssd_txn_cancel(domain->query);
    483         ioloop_dnssd_txn_release(domain->query);
    484         domain->query = NULL;
    485     }
    486     if (domain->srpl_advertise_txn != NULL) {
    487         ioloop_dnssd_txn_cancel(domain->srpl_advertise_txn);
    488         ioloop_dnssd_txn_release(domain->srpl_advertise_txn);
    489         domain->srpl_advertise_txn = NULL;
    490     }
    491     if (domain->srpl_register_wakeup != NULL) {
    492         ioloop_cancel_wake_event(domain->srpl_register_wakeup);
    493         ioloop_wakeup_release(domain->srpl_register_wakeup);
    494         domain->srpl_register_wakeup = NULL;
    495     }
    496     if (domain->partner_discovery_timeout != NULL) {
    497         ioloop_cancel_wake_event(domain->partner_discovery_timeout);
    498         ioloop_wakeup_release(domain->partner_discovery_timeout);
    499         domain->partner_discovery_timeout = NULL;
    500     }
    501 
    502     free(domain);
    503 }
    504 
    505 static void srpl_instance_finalize(srpl_instance_t *instance);
    506 
    507 static void
    508 srpl_instance_service_finalize(srpl_instance_service_t *service)
    509 {
    510     if (service->domain != NULL) {
    511         RELEASE_HERE(service->domain, srpl_domain);
    512     }
    513     if (service->txt_txn != NULL) {
    514         ioloop_dnssd_txn_cancel(service->txt_txn);
    515         ioloop_dnssd_txn_release(service->txt_txn);
    516         service->txt_txn = NULL;
    517     }
    518     if (service->srv_txn != NULL) {
    519         ioloop_dnssd_txn_cancel(service->srv_txn);
    520         ioloop_dnssd_txn_release(service->srv_txn);
    521         service->srv_txn = NULL;
    522     }
    523     free(service->full_service_name);
    524     free(service->host_name);
    525     free(service->txt_rdata);
    526     free(service->srv_rdata);
    527     free(service->ptr_rdata);
    528     if (service->address_query != NULL) {
    529         address_query_cancel(service->address_query);
    530         RELEASE_HERE(service->address_query, address_query);
    531         service->address_query = NULL;
    532     }
    533     if (service->discontinue_timeout != NULL) {
    534         ioloop_cancel_wake_event(service->discontinue_timeout);
    535         ioloop_wakeup_release(service->discontinue_timeout);
    536         service->discontinue_timeout = NULL;
    537     }
    538     if (service->resolve_wakeup != NULL) {
    539         ioloop_cancel_wake_event(service->resolve_wakeup);
    540         ioloop_wakeup_release(service->resolve_wakeup);
    541         service->resolve_wakeup = NULL;
    542     }
    543     if (service->instance != NULL) {
    544         RELEASE_HERE(service->instance, srpl_instance);
    545         service->instance = NULL;
    546     }
    547     free(service);
    548 }
    549 
    550 static void
    551 srpl_instance_finalize(srpl_instance_t *instance)
    552 {
    553     if (instance->domain != NULL) {
    554         RELEASE_HERE(instance->domain, srpl_domain);
    555         instance->domain = NULL;
    556     }
    557     free(instance->instance_name);
    558     if (instance->connection != NULL) {
    559         srpl_connection_discontinue(instance->connection);
    560         RELEASE_HERE(instance->connection, srpl_connection);
    561         instance->connection = NULL;
    562     }
    563     if (instance->reconnect_timeout != NULL) {
    564         ioloop_cancel_wake_event(instance->reconnect_timeout);
    565         ioloop_wakeup_release(instance->reconnect_timeout);
    566         instance->reconnect_timeout = NULL;
    567     }
    568 
    569     srpl_instance_service_t *service = instance->services, *next;
    570     while (service != NULL) {
    571         next = service->next;
    572         RELEASE_HERE(service, srpl_instance_service);
    573         service = next;
    574     }
    575     instance->services = NULL;
    576     free(instance);
    577 }
    578 
    579 #define srpl_connection_message_set(srpl_connection, message) \
    580     srpl_connection_message_set_(srpl_connection, message, __FILE__, __LINE__)
    581 static void
    582 srpl_connection_message_set_(srpl_connection_t *srpl_connection, message_t *message, const char *file, int line)
    583 {
    584     if (srpl_connection->message != NULL) {
    585         ioloop_message_release_(srpl_connection->message, file, line);
    586         srpl_connection->message = NULL;
    587     }
    588     if (message != NULL) {
    589         srpl_connection->message = message;
    590         ioloop_message_retain_(srpl_connection->message, file, line);
    591     }
    592 }
    593 
    594 static message_t *
    595 srpl_connection_message_get(srpl_connection_t *srpl_connection)
    596 {
    597     return srpl_connection->message;
    598 }
    599 
    600 #define srpl_candidate_free(candidate) srpl_candidate_free_(candidate, __FILE__, __LINE__)
    601 static void
    602 srpl_candidate_free_(srpl_candidate_t *candidate, const char *file, int line)
    603 {
    604     if (candidate != NULL) {
    605         if (candidate->name != NULL) {
    606             dns_name_free(candidate->name);
    607             candidate->name = NULL;
    608         }
    609         if (candidate->message != NULL) {
    610             ioloop_message_release_(candidate->message, file, line);
    611             candidate->message = NULL;
    612         }
    613         if (candidate->host != NULL) {
    614             srp_adv_host_release_(candidate->host, file, line);
    615             candidate->host = NULL;
    616         }
    617         free(candidate);
    618     }
    619 }
    620 
    621 static void
    622 srpl_connection_candidates_free(srpl_connection_t *srpl_connection)
    623 {
    624     if (srpl_connection->candidates == NULL) {
    625         goto out;
    626     }
    627     for (int i = 0; i < srpl_connection->num_candidates; i++) {
    628         if (srpl_connection->candidates[i] != NULL) {
    629             srp_adv_host_release(srpl_connection->candidates[i]);
    630         }
    631     }
    632     free(srpl_connection->candidates);
    633     srpl_connection->candidates = NULL;
    634 out:
    635     srpl_connection->num_candidates = srpl_connection->current_candidate = 0;
    636     return;
    637 }
    638 
    639 static void
    640 srpl_srp_client_update_queue_free(srpl_connection_t *srpl_connection)
    641 {
    642     srpl_srp_client_queue_entry_t **cp = &srpl_connection->client_update_queue;
    643     while (*cp) {
    644         srpl_srp_client_queue_entry_t *entry = *cp;
    645         srp_adv_host_release(entry->host);
    646         *cp = entry->next;
    647         free(entry);
    648     }
    649 }
    650 
    651 static void
    652 srpl_connection_candidate_set(srpl_connection_t *srpl_connection, srpl_candidate_t *candidate)
    653 {
    654     if (srpl_connection->candidate != NULL) {
    655         srpl_candidate_free(srpl_connection->candidate);
    656     }
    657     srpl_connection->candidate = candidate;
    658 }
    659 
    660 static void
    661 srpl_host_update_parts_free(srpl_host_update_t *update)
    662 {
    663     if (update->messages != NULL) {
    664         for (int i = 0; i < update->num_messages; i++) {
    665             ioloop_message_release(update->messages[i]);
    666         }
    667         free(update->messages);
    668         update->messages = NULL;
    669         update->num_messages = update->max_messages = update->messages_processed = 0;
    670     }
    671     if (update->hostname != NULL) {
    672         dns_name_free(update->hostname);
    673         update->hostname = NULL;
    674     }
    675 }
    676 
    677 // Free up any temporarily retained or allocated objects on the connection (i.e., not the name).
    678 static void
    679 srpl_connection_reset(srpl_connection_t *srpl_connection)
    680 {
    681     srpl_connection->candidates_not_generated = true;
    682     srpl_connection->database_synchronized = false;
    683     srpl_host_update_parts_free(&srpl_connection->stashed_host);
    684     srpl_connection_message_set(srpl_connection, NULL);
    685     if (srpl_connection->candidate != NULL) {
    686         srpl_candidate_free(srpl_connection->candidate);
    687         srpl_connection->candidate = NULL;
    688     }
    689 
    690     // Cancel keepalive timers
    691     if (srpl_connection->keepalive_send_wakeup) {
    692         ioloop_cancel_wake_event(srpl_connection->keepalive_send_wakeup);
    693     }
    694     if (srpl_connection->keepalive_receive_wakeup) {
    695         ioloop_cancel_wake_event(srpl_connection->keepalive_receive_wakeup);
    696     }
    697 
    698     srpl_connection_candidates_free(srpl_connection);
    699     srpl_srp_client_update_queue_free(srpl_connection);
    700 }
    701 
    702 static void
    703 srpl_connection_finalize(srpl_connection_t *srpl_connection)
    704 {
    705     if (srpl_connection->instance) {
    706         RELEASE_HERE(srpl_connection->instance, srpl_instance);
    707         srpl_connection->instance = NULL;
    708     }
    709     if (srpl_connection->connection != NULL) {
    710         ioloop_comm_release(srpl_connection->connection);
    711         srpl_connection->connection = NULL;
    712         gettimeofday(&srpl_connection->connection_null_time, NULL);
    713         srpl_connection->connection_null_reason = "finalize"; // obvsly should never see this!
    714     }
    715     if (srpl_connection->reconnect_wakeup != NULL) {
    716         ioloop_cancel_wake_event(srpl_connection->reconnect_wakeup);
    717         ioloop_wakeup_release(srpl_connection->reconnect_wakeup);
    718         srpl_connection->reconnect_wakeup = NULL;
    719     }
    720     if (srpl_connection->state_timeout != NULL) {
    721         ioloop_cancel_wake_event(srpl_connection->state_timeout);
    722         ioloop_wakeup_release(srpl_connection->state_timeout);
    723         srpl_connection->state_timeout = NULL;
    724     }
    725     if (srpl_connection->keepalive_send_wakeup != NULL) {
    726         ioloop_cancel_wake_event(srpl_connection->keepalive_send_wakeup);
    727         ioloop_wakeup_release(srpl_connection->keepalive_send_wakeup);
    728         srpl_connection->keepalive_send_wakeup = NULL;
    729     }
    730     if (srpl_connection->keepalive_receive_wakeup != NULL) {
    731         ioloop_cancel_wake_event(srpl_connection->keepalive_receive_wakeup);
    732         ioloop_wakeup_release(srpl_connection->keepalive_receive_wakeup);
    733         srpl_connection->keepalive_receive_wakeup = NULL;
    734     }
    735     srpl_host_update_parts_free(&srpl_connection->stashed_host);
    736     free(srpl_connection->name);
    737     srpl_connection_reset(srpl_connection);
    738     free(srpl_connection);
    739 }
    740 
    741 void
    742 srpl_connection_release_(srpl_connection_t *srpl_connection, const char *file, int line)
    743 {
    744     RELEASE(srpl_connection, srpl_connection);
    745 }
    746 
    747 void
    748 srpl_connection_retain_(srpl_connection_t *srpl_connection, const char *file, int line)
    749 {
    750     RETAIN(srpl_connection, srpl_connection);
    751 }
    752 
    753 srpl_connection_t *
    754 srpl_connection_create(srpl_instance_t *instance, bool outgoing)
    755 {
    756     srpl_connection_t *srpl_connection = calloc(1, sizeof (*srpl_connection)), *ret = NULL;
    757     if (srpl_connection == NULL) {
    758         goto out;
    759     }
    760     RETAIN_HERE(srpl_connection, srpl_connection);
    761 #define POINTER_TO_HEX_MAX_STRLEN 19 // 0x<...>
    762     size_t srpl_connection_name_length = strlen(instance->instance_name) + 2 + POINTER_TO_HEX_MAX_STRLEN + 3;
    763     srpl_connection->name = malloc(srpl_connection_name_length);
    764     if (srpl_connection->name == NULL) {
    765         goto out;
    766     }
    767     srpl_connection->keepalive_send_wakeup = ioloop_wakeup_create();
    768     if (srpl_connection->keepalive_send_wakeup == NULL) {
    769         goto out;
    770     }
    771     srpl_connection->keepalive_receive_wakeup = ioloop_wakeup_create();
    772     if (srpl_connection->keepalive_receive_wakeup == NULL) {
    773         goto out;
    774     }
    775     srpl_connection->keepalive_interval = DEFAULT_KEEPALIVE_WAKEUP_EXPIRY / 2;
    776     snprintf(srpl_connection->name, srpl_connection_name_length, "%s%s (%p)", outgoing ? ">" : "<", instance->instance_name, srpl_connection);
    777     srpl_connection->is_server = !outgoing;
    778     srpl_connection->instance = instance;
    779     RETAIN_HERE(instance, srpl_instance);
    780 #ifdef SRP_TEST_SERVER
    781     srpl_connection->state = srpl_state_test_event_intercept;
    782 #endif
    783     ret = srpl_connection;
    784     srpl_connection = NULL;
    785 out:
    786     if (srpl_connection != NULL) {
    787         RELEASE_HERE(srpl_connection, srpl_connection);
    788     }
    789     return ret;
    790 }
    791 
    792 static void
    793 srpl_connection_context_release(void *context)
    794 {
    795     srpl_connection_t *srpl_connection = context;
    796 
    797     RELEASE_HERE(srpl_connection, srpl_connection);
    798 }
    799 
    800 static void
    801 srpl_instance_service_context_release(void *context)
    802 {
    803     srpl_instance_service_t *service = context;
    804 
    805     RELEASE_HERE(service, srpl_instance_service);
    806 }
    807 
    808 static void
    809 srpl_instance_context_release(void *context)
    810 {
    811     srpl_instance_t *instance = context;
    812 
    813     RELEASE_HERE(instance, srpl_instance);
    814 }
    815 
// Take an instance out of service: unlink it from its domain's instance list, detach and discontinue
// its connection, and release a reference on the instance.
//
// context is the srpl_instance_t to discontinue; the void * signature matches the callback style
// used elsewhere in this file, and the function is also called directly from
// srpl_instance_service_discontinue_timeout() once an instance has no services left.
//
// NOTE(review): instance->domain is dereferenced without a NULL check, and domain is used again after
// the instance has been released -- this presumably relies on the domain outliving the instance;
// confirm against callers.
static void
srpl_instance_discontinue_timeout(void *context)
{
    srpl_instance_t **sp = NULL, *instance = context;
    srpl_domain_t *domain = instance->domain;

    INFO("discontinuing instance " PRI_S_SRP " with partner id %" PRIx64, instance->instance_name, instance->partner_id);
    // Unlink the instance from the domain's singly linked list, if it is on it.
    for (sp = &domain->instances; *sp; sp = &(*sp)->next) {
        if (*sp == instance) {
            *sp = instance->next;
            break;
        }
    }

    srpl_connection_t *srpl_connection = instance->connection;
    if (srpl_connection != NULL) {
        // Break the connection's back-reference to the instance before discontinuing it.
        RELEASE_HERE(srpl_connection->instance, srpl_instance);
        srpl_connection->instance = NULL;
        srpl_connection_discontinue(srpl_connection);
        // The instance no longer has a reference to the srpl_connection object.
        RELEASE_HERE(srpl_connection, srpl_connection);
        instance->connection = NULL;
    }
    // Presumably the reference that the domain's instance list held -- TODO confirm.
    RELEASE_HERE(instance, srpl_instance);

    // Check to see if we are eligible to move into the routine state if we haven't done so.
    // If the partner we failed to sync with goes away, we could enter the routine state if
    // we have successfully synced with all other partners discovered in startup.
    if (domain->srpl_opstate != SRPL_OPSTATE_ROUTINE) {
        srpl_maybe_sync_or_transition(domain);
    }
}
    848 
// Tear down a single replication service record: unlink it from whichever list holds it, cancel and
// release its timers, address query and DNS-SD transactions, and drop its reference on its instance.
// If that leaves the instance without any services, the instance itself is discontinued.
//
// context is the srpl_instance_service_t to tear down. The function is used as the callback for the
// service's discontinue wake event and is also called directly.
static void
srpl_instance_service_discontinue_timeout(void *context)
{
    srpl_instance_service_t **hp = NULL, *service = context;
    srpl_domain_t *domain = service->domain;
    srpl_instance_t *instance = service->instance;

    // Retain for duration of function, since otherwise we might finalize it below.

    // This retain shouldn't be necessary if we are actually being called by the timeout, because the timeout holds a
    // reference to service that can't be released below. However, this function can be called directly, outside of a
    // timeout, and in that case we do need to retain service for the function.

    RETAIN_HERE(service, srpl_instance_service);

    // Remove the service from either the unresolved_services list or resolved instance list
    if (instance == NULL) {
        hp = &domain->unresolved_services;
    } else {
        RETAIN_HERE(instance, srpl_instance); // Retain instance for life of function in case we decrement its refcnt below.
        hp = &instance->services;
    }
    for (; *hp; hp = &(*hp)->next) {
        if (*hp == service) {
            *hp = service->next;
            RELEASE_HERE(service, srpl_instance_service); // Release service list's reference to instance_service.
            break;
        }
    }

    // Cancel and release every timer, query and transaction the service still owns.
    if (service->discontinue_timeout != NULL) {
        ioloop_cancel_wake_event(service->discontinue_timeout);
        ioloop_wakeup_release(service->discontinue_timeout);
        service->discontinue_timeout = NULL;
    }
    if (service->resolve_wakeup != NULL) {
        ioloop_cancel_wake_event(service->resolve_wakeup);
        ioloop_wakeup_release(service->resolve_wakeup);
        service->resolve_wakeup = NULL;
    }
    if (service->address_query != NULL) {
        address_query_cancel(service->address_query);
        RELEASE_HERE(service->address_query, address_query);
        service->address_query = NULL;
    }
    if (service->txt_txn != NULL) {
        ioloop_dnssd_txn_cancel(service->txt_txn);
        ioloop_dnssd_txn_release(service->txt_txn);
        service->txt_txn = NULL;
        service->resolve_started = false;
    }
    if (service->srv_txn != NULL) {
        ioloop_dnssd_txn_cancel(service->srv_txn);
        ioloop_dnssd_txn_release(service->srv_txn);
        service->srv_txn = NULL;
        service->resolve_started = false;
    }
    // Drop the service's own reference on its instance.
    if (service->instance != NULL) {
        RELEASE_HERE(service->instance, srpl_instance);
        service->instance = NULL;
    }
    if (instance != NULL) {
        // That was the instance's last service: discontinue the instance as well.
        if (instance->services == NULL) {
            srpl_instance_discontinue_timeout(instance);
        }
        RELEASE_HERE(instance, srpl_instance); // Release this function's reference to instance
    }
    RELEASE_HERE(service, srpl_instance_service); // Release this function's reference to instance_service.
}
    918 
    919 static void
    920 srpl_instance_services_discontinue(srpl_instance_t *instance)
    921 {
    922     srpl_instance_service_t *service;
    923     for (service = instance->services; service != NULL; ) {
    924         // The service is retained on the list, but...
    925         srpl_instance_service_t *next = service->next;
    926         // This is going to release it...
    927         srpl_instance_service_discontinue_timeout(service);
    928         // So next is still valid here, but service isn't.
    929         service = next;
    930     }
    931 }
    932 
    933 static void
    934 srpl_instance_service_discontinue(srpl_instance_service_t *service)
    935 {
    936     // Already discontinuing.
    937     if (service->discontinuing) {
    938         INFO("Replication service " PRI_S_SRP " went away, already discontinuing", service->full_service_name);
    939         return;
    940     }
    941     if (service->num_copies > 0) {
    942         INFO("Replication service " PRI_S_SRP " went away, %d still left", service->host_name, service->num_copies);
    943         return;
    944     }
    945     INFO("Replication service " PRI_S_SRP " went away, none left, discontinuing", service->full_service_name);
    946     service->discontinuing = true;
    947 
    948     // DNSServiceResolve doesn't give us the kDNSServiceFlagAdd flag--apparently it's assumed that we know the
    949     // service was removed because we get a remove on the browse. So we need to restart the resolve if the
    950     // instance comes back, rather than continuing to use the old resolve transaction.
    951     if (service->txt_txn != NULL) {
    952         ioloop_dnssd_txn_cancel(service->txt_txn);
    953         ioloop_dnssd_txn_release(service->txt_txn);
    954         service->txt_txn = NULL;
    955     }
    956     if (service->srv_txn != NULL) {
    957         ioloop_dnssd_txn_cancel(service->srv_txn);
    958         ioloop_dnssd_txn_release(service->srv_txn);
    959         service->srv_txn = NULL;
    960     }
    961     service->resolve_started = false;
    962 
    963     // if all the services are discontinuing, we mark the instance to be discontinuing as well.
    964     // discontinuing instance will be exluded when we check if the server has sync-ed on all the
    965     // instances in order to move to the routine state and when we pick the winning dataset id
    966     // from the discovered instances (i.e., discontinuing instance no longer qualifies for
    967     // dataset_id election).
    968     srpl_instance_t *instance = service->instance;
    969     if (instance != NULL) {
    970         srpl_instance_service_t *sp;
    971         for(sp = instance->services; sp != NULL; sp = sp->next) {
    972             if (!sp->discontinuing) {
    973                 break;
    974             }
    975         }
    976         if (sp == NULL) {
    977             instance->discontinuing = true;
    978         }
    979     }
    980     // It's not uncommon for a name to drop and then come back immediately. Wait 30s before
    981     // discontinuing the instance host.
    982     if (service->discontinue_timeout == NULL) {
    983         service->discontinue_timeout = ioloop_wakeup_create();
    984         // Oh well.
    985         if (service->discontinue_timeout == NULL) {
    986             srpl_instance_service_discontinue_timeout(service);
    987             return;
    988         }
    989     }
    990 
    991     RETAIN_HERE(service, srpl_instance_service);
    992     ioloop_add_wake_event(service->discontinue_timeout, service, srpl_instance_service_discontinue_timeout,
    993                           srpl_instance_service_context_release, 30 * 1000);
    994 }
    995 
    996 
    997 static void
    998 srpl_instance_discontinue(srpl_instance_t *instance)
    999 {
   1000     srpl_instance_service_t *service, *next;
   1001     instance->discontinuing = true;
   1002     for (service = instance->services; service != NULL; service = next) {
   1003         next = service->next;
   1004         service->num_copies = 0;
   1005         srpl_instance_service_discontinue(service);
   1006     }
   1007 }
   1008 
   1009 void
   1010 srpl_shutdown(srp_server_t *server_state)
   1011 {
   1012     srpl_instance_t *instance, *next;
   1013     srpl_instance_service_t *service, *next_service;
   1014 
   1015     if (server_state->current_thread_domain_name == NULL) {
   1016         INFO("no current domain");
   1017         return;
   1018     }
   1019     for (srpl_domain_t **dp = &server_state->srpl_domains; *dp != NULL; ) {
   1020         srpl_domain_t *domain = *dp;
   1021         if (!strcmp(domain->name, server_state->current_thread_domain_name)) {
   1022             for (instance = domain->instances; instance != NULL; instance = next) {
   1023                 next = instance->next;
   1024                 srpl_instance_services_discontinue(instance);
   1025             }
   1026             for (service = domain->unresolved_services; service != NULL; service = next_service) {
   1027                 next_service = service->next;
   1028                 srpl_instance_service_discontinue_timeout(service);
   1029             }
   1030             if (domain->query != NULL) {
   1031                 ioloop_dnssd_txn_cancel(domain->query);
   1032                 ioloop_dnssd_txn_release(domain->query);
   1033                 domain->query = NULL;
   1034             }
   1035             if (domain->srpl_advertise_txn != NULL) {
   1036                 ioloop_dnssd_txn_cancel(domain->srpl_advertise_txn);
   1037                 ioloop_dnssd_txn_release(domain->srpl_advertise_txn);
   1038                 domain->srpl_advertise_txn = NULL;
   1039             }
   1040             if (domain->partner_discovery_timeout != NULL) {
   1041                 ioloop_cancel_wake_event(domain->partner_discovery_timeout);
   1042                 ioloop_wakeup_release(domain->partner_discovery_timeout);
   1043                 domain->partner_discovery_timeout = NULL;
   1044             }
   1045             *dp = domain->next;
   1046             RELEASE_HERE(domain, srpl_domain);
   1047             free(server_state->current_thread_domain_name);
   1048             server_state->current_thread_domain_name = NULL;
   1049         } else {
   1050             dp = &(*dp)->next;
   1051         }
   1052     }
   1053 }
   1054 
   1055 void
   1056 srpl_disable(srp_server_t *server_state)
   1057 {
   1058     srpl_shutdown(server_state);
   1059     server_state->srp_replication_enabled = false;
   1060 }
   1061 
   1062 void
   1063 srpl_drop_srpl_connection(srp_server_t *NONNULL server_state)
   1064 {
   1065     for (srpl_domain_t *domain = server_state->srpl_domains; domain != NULL; domain = domain->next) {
   1066         for (srpl_instance_t *instance = domain->instances; instance != NULL; instance = instance->next) {
   1067             if (instance->connection != NULL && instance->connection->state > srpl_state_disconnect_wait) {
   1068                 srpl_connection_discontinue(instance->connection);
   1069             }
   1070         }
   1071     }
   1072 }
   1073 
   1074 void
   1075 srpl_undrop_srpl_connection(srp_server_t *NONNULL server_state)
   1076 {
   1077     for (srpl_domain_t *domain = server_state->srpl_domains; domain != NULL; domain = domain->next) {
   1078         for (srpl_instance_t *instance = domain->instances; instance != NULL; instance = instance->next) {
   1079             srpl_instance_reconnect(instance);
   1080         }
   1081     }
   1082 }
   1083 
   1084 // Stop service advertisement in the given domain.
   1085 static void
   1086 srpl_stop_domain_advertisement(srpl_domain_t *NONNULL domain)
   1087 {
   1088     INFO("dropping advertisement for domain " PUB_S_SRP, domain->name);
   1089     if (domain->srpl_advertise_txn != NULL) {
   1090         ioloop_dnssd_txn_cancel(domain->srpl_advertise_txn);
   1091         ioloop_dnssd_txn_release(domain->srpl_advertise_txn);
   1092         domain->srpl_advertise_txn = NULL;
   1093     }
   1094 }
   1095 
   1096 // Stop service advertisement in all the domains
   1097 void
   1098 srpl_drop_srpl_advertisement(srp_server_t *NONNULL server_state)
   1099 {
   1100     srpl_domain_t *domain;
   1101     for (domain = server_state->srpl_domains; domain != NULL; domain = domain->next) {
   1102         srpl_stop_domain_advertisement(domain);
   1103     }
   1104 }
   1105 
   1106 void
   1107 srpl_undrop_srpl_advertisement(srp_server_t *NONNULL server_state)
   1108 {
   1109     srpl_domain_t *domain;
   1110     for (domain = server_state->srpl_domains; domain != NULL; domain = domain->next) {
   1111         srpl_domain_advertise(domain);
   1112     }
   1113 }
   1114 
   1115 
   1116 // Copy from into to, and then NULL out the host pointer in from, which is not refcounted, so that we don't get a
   1117 // double free later. Add a reference to the message, since it is refcounted.
   1118 static void
   1119 srpl_host_update_steal_parts(srpl_host_update_t *to, srpl_host_update_t *from)
   1120 {
   1121     srpl_host_update_parts_free(to);
   1122     *to = *from;
   1123     from->hostname = NULL;
   1124     from->messages = NULL;
   1125     from->num_messages = from->max_messages = from->messages_processed = 0;
   1126 }
   1127 
   1128 static bool
   1129 srpl_event_content_type_set_(srpl_event_t *event, srpl_event_content_type_t content_type, const char *file, int line)
   1130 {
   1131     switch(event->content_type) {
   1132     case srpl_event_content_type_none:
   1133     case srpl_event_content_type_address:
   1134     case srpl_event_content_type_session:
   1135     case srpl_event_content_type_candidate_disposition:
   1136     case srpl_event_content_type_rcode:
   1137     case srpl_event_content_type_client_result: // pointers owned by caller
   1138     case srpl_event_content_type_advertise_finished_result:
   1139         break;
   1140 
   1141     case srpl_event_content_type_candidate:
   1142         if (event->content.candidate != NULL) {
   1143             srpl_candidate_free_(event->content.candidate, file, line);
   1144             event->content.candidate = NULL;
   1145         }
   1146         break;
   1147     case srpl_event_content_type_host_update:
   1148         srpl_host_update_parts_free(&event->content.host_update);
   1149         break;
   1150     }
   1151     memset(&event->content, 0, sizeof(event->content));
   1152     if (content_type == srpl_event_content_type_candidate) {
   1153         event->content.candidate = calloc(1, sizeof(srpl_candidate_t));
   1154         if (event->content.candidate == NULL) {
   1155             return false;
   1156         }
   1157     }
   1158     event->content_type = content_type;
   1159     return true;
   1160 }
   1161 
// Handle loss of the underlying comm_t connection.
//
// comm:    the comm_t that was disconnected.
// context: the srpl_connection_t associated with it.
// error:   unused.
//
// The comm_t (and DSO state) are dropped if they are the ones still attached to the connection, the
// connection's per-session state is reset, and then one of three things happens:
//   - we reconnect (state -> srpl_state_next_address_get) if the domain is in the startup state or
//     our partner id is greater than the peer's;
//   - a srpl_event_disconnected event is delivered if the connection was in disconnect_wait;
//   - otherwise the connection goes idle and the instance's reference to it is released.
static void
srpl_disconnected_callback(comm_t *comm, void *context, int UNUSED error)
{
    srpl_connection_t *srpl_connection = context;
    srpl_domain_t *domain;

    // No matter what state we are in, if we are disconnected, we can't continue with the existing connection.
    // Either we need to make a new connection, or go idle.

    srpl_instance_t *instance = srpl_connection->instance;

    // The connection would still be holding a reference; hold a reference to the connection to avoid it being released
    // prematurely.
    RETAIN_HERE(srpl_connection, srpl_connection);

    // Get rid of the comm_t connection object if it's still around
    if (srpl_connection->connection != NULL && srpl_connection->connection == comm) {
        comm_t *connection = srpl_connection->connection;
        srpl_connection->connection = NULL;
        // Record when and why the comm pointer was cleared; srpl_dso_message_setup reports this if
        // a send is later attempted on this connection.
        gettimeofday(&srpl_connection->connection_null_time, NULL);
        srpl_connection->connection_null_reason = "disconnected_callback";
        ioloop_comm_release(connection);

        if (srpl_connection->dso != NULL) {
            dso_state_cancel(srpl_connection->dso);
            srpl_connection->dso = NULL;
        }
    }

    // If there's no instance, this connection just needs to go away (and presumably has).
    if (instance == NULL) {
        INFO("the instance is NULL.");
        goto out;
    }

    // Because instance is still holding a reference to srpl_connection, it's safe to keep using srpl_connection.

    // Clear old data from connection.
    srpl_connection_reset(srpl_connection);

    domain = instance->domain;
    if (domain == NULL) {
        // If domain is NULL, instance has been discontinued.
        INFO(PRI_S_SRP "instance was discontinued, not reconnecting.", instance->instance_name);
    } else {
        // If we are in the startup state, we should reinitiate the connection to the peer.
        // Otherwise, we should reconnect only if our partner id is greater than the peer's.
        // If there's no partner id on the instance, the instance should be a temporary one
        // and that means we haven't discovered the peer yet, so we can just drop the connection
        // and wait to discover it or for it to reconnect.
        if (domain->srpl_opstate == SRPL_OPSTATE_STARTUP ||
            (instance->have_partner_id && domain->server_state != NULL &&
            domain->partner_id > instance->partner_id))
        {
            // cancel reconnect_timeout if there's one scheduled.
            if (instance->reconnect_timeout != NULL) {
                ioloop_cancel_wake_event(instance->reconnect_timeout);
            }
            INFO(PRI_S_SRP ": disconnect received, reconnecting.", srpl_connection->name);
            srpl_connection_next_state(srpl_connection, srpl_state_next_address_get);
            goto out;
        }
    }

    // If the connection is in the disconnect_wait state, deliver an event.
    if (srpl_connection->state == srpl_state_disconnect_wait) {
        srpl_event_t event;
        srpl_event_initialize(&event, srpl_event_disconnected);
        srpl_event_deliver(srpl_connection, &event);
        goto out;
    }

    // If it's not our job to reconnect, we no longer need this connection. Release the reference
    // held by the instance (which'd cause the connection to be finalized).
    srpl_connection_next_state(srpl_connection, srpl_state_idle);
    if (instance->connection == srpl_connection) {
        RELEASE_HERE(srpl_connection, srpl_connection);
        instance->connection = NULL;
    }

out:
    // Drop the reference taken at the top of this function.
    RELEASE_HERE(srpl_connection, srpl_connection);
}
   1245 
   1246 static bool
   1247 srpl_dso_message_setup(dso_state_t *dso, dso_message_t *state, dns_towire_state_t *towire, uint8_t *buffer,
   1248                        size_t buffer_size, message_t *message, bool unidirectional, bool response, int rcode,
   1249                        uint16_t xid, srpl_connection_t *srpl_connection)
   1250 {
   1251     uint16_t send_xid = 0;
   1252 
   1253     if (srpl_connection->connection == NULL) {
   1254         struct tm tm;
   1255         localtime_r(&srpl_connection->connection_null_time.tv_sec, &tm);
   1256         char tmoff = tm.tm_gmtoff > 0 ? '+' : '-';
   1257         long tzoff = tm.tm_gmtoff > 0 ? tm.tm_gmtoff : -tm.tm_gmtoff;
   1258         FAULT("sending a message on a nonexistent connection: " PUB_S_SRP " (%04d-%02d-%02d %02d:%02d:%02d.%06d%c%02ld%02ld)!",
   1259               srpl_connection->connection_null_reason == NULL
   1260               ? "no connection ever set" : srpl_connection->connection_null_reason,
   1261               tm.tm_year + 1900, tm.tm_mon + 1, tm.tm_mday, tm.tm_hour, tm.tm_min, tm.tm_sec,
   1262               srpl_connection->connection_null_time.tv_usec, tmoff, tzoff / 3600, (tzoff / 60) % 60);
   1263         return false;
   1264     }
   1265 
   1266     if (buffer_size < DNS_HEADER_SIZE) {
   1267         ERROR("internal: invalid buffer size %zd", buffer_size);
   1268         return false;
   1269     }
   1270 
   1271     if (response) {
   1272         if (message != NULL) {
   1273             send_xid = message->wire.id;
   1274         } else {
   1275             send_xid = xid;
   1276         }
   1277     }
   1278     dso_make_message(state, buffer, buffer_size, dso, unidirectional, response,
   1279                      send_xid, rcode, srpl_connection);
   1280     memset(towire, 0, sizeof(*towire));
   1281     towire->p = &buffer[DNS_HEADER_SIZE];
   1282     towire->lim = towire->p + (buffer_size - DNS_HEADER_SIZE);
   1283     towire->message = (dns_wire_t *)buffer;
   1284     return true;
   1285 }
   1286 
   1287 static srpl_domain_t *
   1288 srpl_connection_domain(srpl_connection_t *srpl_connection)
   1289 {
   1290     if (srpl_connection->instance == NULL) {
   1291         INFO("connection " PRI_S_SRP " (%p) has no instance.", srpl_connection->name, srpl_connection);
   1292         return NULL;
   1293     }
   1294     return srpl_connection->instance->domain;
   1295 }
   1296 
   1297 static bool
   1298 srpl_keepalive_send(srpl_connection_t *srpl_connection, bool response, uint16_t xid)
   1299 {
   1300     uint8_t dsobuf[SRPL_KEEPALIVE_MESSAGE_LENGTH];
   1301     dns_towire_state_t towire;
   1302     dso_message_t message;
   1303     struct iovec iov;
   1304 
   1305     if (!srpl_dso_message_setup(srpl_connection->dso, &message, &towire, dsobuf, sizeof(dsobuf),
   1306                                 NULL, srpl_connection->dso->is_server, response,
   1307                                 dns_rcode_noerror, xid, srpl_connection)) {
   1308         return false;
   1309     }
   1310     dns_u16_to_wire(&towire, kDSOType_Keepalive);
   1311     dns_rdlength_begin(&towire);
   1312     dns_u32_to_wire(&towire, DEFAULT_KEEPALIVE_WAKEUP_EXPIRY / 2); // Idle timeout (we are never idle)
   1313     dns_u32_to_wire(&towire, DEFAULT_KEEPALIVE_WAKEUP_EXPIRY / 2); // Keepalive timeout
   1314     dns_rdlength_end(&towire);
   1315     if (towire.error) {
   1316         ERROR("ran out of message space at " PUB_S_SRP ", :%d", __FILE__, towire.line);
   1317         return false;
   1318     }
   1319     memset(&iov, 0, sizeof(iov));
   1320     iov.iov_len = towire.p - dsobuf;
   1321     iov.iov_base = dsobuf;
   1322     if (!ioloop_send_message(srpl_connection->connection, srpl_connection_message_get(srpl_connection), &iov, 1)) {
   1323         INFO("send failed");
   1324         srpl_disconnect(srpl_connection);
   1325         return false;
   1326     }
   1327 
   1328     INFO("sent %zd byte " PUB_S_SRP " Keepalive, xid %02x%02x (was %04x), to " PRI_S_SRP, iov.iov_len,
   1329          (response ? "response" : (srpl_connection->is_server
   1330                                    ? "unidirectional"
   1331                                    : "query")), dsobuf[0], dsobuf[1], xid, srpl_connection->name);
   1332     srpl_message_sent(srpl_connection);
   1333     return true;
   1334 }
   1335 
   1336 // If we ever get a wakeup, it means that a wakeup send interval has passed since the last time we sent any message on
   1337 // this connection, so we should send a keepalive message.
   1338 static void
   1339 srpl_connection_keepalive_send_wakeup(void *context)
   1340 {
   1341     srpl_connection_t *srpl_connection = context;
   1342 
   1343     // In case we lost our connection but still have keepalive timers going, now's a good time to
   1344     // cancel them.
   1345     if (srpl_connection->connection == NULL) {
   1346         // Cancel keepalive timers
   1347         if (srpl_connection->keepalive_send_wakeup) {
   1348             ioloop_cancel_wake_event(srpl_connection->keepalive_send_wakeup);
   1349         }
   1350         if (srpl_connection->keepalive_receive_wakeup) {
   1351             ioloop_cancel_wake_event(srpl_connection->keepalive_receive_wakeup);
   1352         }
   1353         return;
   1354     }
   1355     srpl_keepalive_send(srpl_connection, false, 0);
   1356     srpl_message_sent(srpl_connection);
   1357 }
   1358 
   1359 static void
   1360 srpl_message_sent(srpl_connection_t *srpl_connection)
   1361 {
   1362     if (!srpl_connection->is_server) {
   1363         srpl_connection->last_message_sent = srp_time();
   1364         ioloop_add_wake_event(srpl_connection->keepalive_send_wakeup,
   1365                               srpl_connection, srpl_connection_keepalive_send_wakeup, srpl_connection_context_release,
   1366                               srpl_connection->keepalive_interval);
   1367         RETAIN_HERE(srpl_connection, srpl_connection); // for the callback
   1368     }
   1369 }
   1370 
   1371 static bool
   1372 srpl_session_message_send(srpl_connection_t *srpl_connection, bool response)
   1373 {
   1374     uint8_t dsobuf[SRPL_SESSION_MESSAGE_LENGTH];
   1375     dns_towire_state_t towire;
   1376     dso_message_t message;
   1377     struct iovec iov;
   1378     srpl_domain_t *domain = srpl_connection_domain(srpl_connection);
   1379     if (domain == NULL) {
   1380         return false;
   1381     }
   1382 #ifdef TEST_DSO_MESSAGE_SETUP_CONNECTION_NULL_FAULT
   1383     comm_t *connection = srpl_connection->connection;
   1384     const char *old_null_reason = srpl_connection->connection_null_reason;
   1385     struct timeval old_null_time = srpl_connection->connection_null_time;
   1386     srpl_connection->connection = NULL;
   1387     srpl_connection->connection_null_reason = "unit test";
   1388     gettimeofday(&srpl_connection->connection_null_time, NULL);
   1389     bool ret = srpl_dso_message_setup(srpl_connection->dso, &message, &towire, dsobuf, sizeof(dsobuf),
   1390                                       srpl_connection_message_get(srpl_connection), false, response, 0, 0, srpl_connection);
   1391     if (ret != false) {
   1392         FAULT("failed to detect null connection!");
   1393     }
   1394     srpl_connection->connection = connection;
   1395     srpl_connection->connection_null_reason = old_null_reason;
   1396     srpl_connection->connection_null_time = old_null_time;
   1397 #endif
   1398 
   1399     if (!srpl_dso_message_setup(srpl_connection->dso, &message, &towire, dsobuf, sizeof(dsobuf),
   1400                                 srpl_connection_message_get(srpl_connection), false, response, 0, 0, srpl_connection)) {
   1401         return false;
   1402     }
   1403     dns_u16_to_wire(&towire, kDSOType_SRPLSession);
   1404     dns_rdlength_begin(&towire);
   1405     dns_u64_to_wire(&towire, domain->partner_id);
   1406     dns_rdlength_end(&towire);
   1407 
   1408     // version TLV
   1409     dns_u16_to_wire(&towire, kDSOType_SRPLVersion);
   1410     dns_rdlength_begin(&towire);
   1411     dns_u16_to_wire(&towire, SRPL_CURRENT_VERSION);
   1412     dns_rdlength_end(&towire);
   1413 
   1414     // domain name tlv
   1415     dns_u16_to_wire(&towire, kDSOType_SRPLDomainName);
   1416     dns_rdlength_begin(&towire);
   1417     INFO("include domain " PRI_S_SRP, domain->name);
   1418     dns_full_name_to_wire(NULL, &towire, domain->name);
   1419     dns_rdlength_end(&towire);
   1420     if (domain->srpl_opstate == SRPL_OPSTATE_STARTUP) {
   1421         // new partner TLV
   1422         dns_u16_to_wire(&towire, kDSOType_SRPLNewPartner);
   1423         dns_rdlength_begin(&towire);
   1424         dns_rdlength_end(&towire);
   1425     }
   1426 
   1427     if (towire.error) {
   1428         ERROR("ran out of message space at " PUB_S_SRP ", :%d", __FILE__, towire.line);
   1429         return false;
   1430     }
   1431     memset(&iov, 0, sizeof(iov));
   1432     iov.iov_len = towire.p - dsobuf;
   1433     iov.iov_base = dsobuf;
   1434     if (!ioloop_send_message(srpl_connection->connection, srpl_connection_message_get(srpl_connection), &iov, 1)) {
   1435         INFO("send failed");
   1436         srpl_disconnect(srpl_connection);
   1437         return false;
   1438     }
   1439 
   1440     INFO(PRI_S_SRP " sent SRPLSession " PUB_S_SRP ", id %" PRIx64, srpl_connection->name,
   1441          response ? "response" : "message", domain->partner_id);
   1442     srpl_message_sent(srpl_connection);
   1443     return true;
   1444 }
   1445 
   1446 static bool
   1447 srpl_send_candidates_message_send(srpl_connection_t *srpl_connection, bool response)
   1448 {
   1449     uint8_t dsobuf[SRPL_SEND_CANDIDATES_LENGTH];
   1450     dns_towire_state_t towire;
   1451     dso_message_t message;
   1452     struct iovec iov;
   1453 
   1454     if (!srpl_dso_message_setup(srpl_connection->dso, &message, &towire, dsobuf, sizeof(dsobuf),
   1455                                 srpl_connection_message_get(srpl_connection), false, response, 0, 0, srpl_connection)) {
   1456         return false;
   1457     }
   1458     dns_u16_to_wire(&towire, kDSOType_SRPLSendCandidates);
   1459     dns_rdlength_begin(&towire);
   1460     dns_rdlength_end(&towire);
   1461     if (towire.error) {
   1462         ERROR("ran out of message space at " PUB_S_SRP ", :%d", __FILE__, towire.line);
   1463         return false;
   1464     }
   1465     memset(&iov, 0, sizeof(iov));
   1466     iov.iov_len = towire.p - dsobuf;
   1467     iov.iov_base = dsobuf;
   1468     if (!ioloop_send_message(srpl_connection->connection, srpl_connection_message_get(srpl_connection), &iov, 1)) {
   1469         INFO("send failed");
   1470         srpl_disconnect(srpl_connection);
   1471         return false;
   1472     }
   1473 
   1474     INFO(PRI_S_SRP " sent SRPLSendCandidates " PUB_S_SRP, srpl_connection->name, response ? "response" : "query");
   1475     srpl_message_sent(srpl_connection);
   1476     return true;
   1477 }
   1478 
   1479 static bool
   1480 srpl_candidate_message_send(srpl_connection_t *srpl_connection, adv_host_t *host)
   1481 {
   1482     uint8_t dsobuf[SRPL_CANDIDATE_MESSAGE_LENGTH];
   1483     dns_towire_state_t towire;
   1484     dso_message_t message;
   1485     struct iovec iov;
   1486     time_t update_time = host->update_time;
   1487 
   1488     if (!srpl_dso_message_setup(srpl_connection->dso, &message, &towire,
   1489                                 dsobuf, sizeof(dsobuf), NULL, false, false, 0, 0, srpl_connection)) {
   1490         return false;
   1491     }
   1492 
   1493     // For testing, make the update time really wrong so that signature validation fails. This will not actually
   1494     // cause a failure unless the SRP requestor sends a time range, so really only useful for testing with the
   1495     // mDNSResponder srp-client, not with e.g. a Thread client.
   1496     if (host->server_state != NULL && host->server_state->break_srpl_time) {
   1497         INFO("breaking time: %lu -> %lu", (unsigned long)update_time, (unsigned long)(update_time - 1800));
   1498         update_time -= 1800;
   1499     }
   1500     dns_u16_to_wire(&towire, kDSOType_SRPLCandidate);
   1501     dns_rdlength_begin(&towire);
   1502     dns_rdlength_end(&towire);
   1503     dns_u16_to_wire(&towire, kDSOType_SRPLHostname);
   1504     dns_rdlength_begin(&towire);
   1505     dns_full_name_to_wire(NULL, &towire, host->name);
   1506     dns_rdlength_end(&towire);
   1507     dns_u16_to_wire(&towire, kDSOType_SRPLTimeOffset);
   1508     dns_rdlength_begin(&towire);
   1509     dns_u32_to_wire(&towire, (uint32_t)(srp_time() - update_time));
   1510     dns_rdlength_end(&towire);
   1511     dns_u16_to_wire(&towire, kDSOType_SRPLKeyID);
   1512     dns_rdlength_begin(&towire);
   1513     dns_u32_to_wire(&towire, host->key_id);
   1514     dns_rdlength_end(&towire);
   1515     if (towire.error) {
   1516         ERROR("ran out of message space at " PUB_S_SRP ", :%d", __FILE__, towire.line);
   1517         return false;
   1518     }
   1519     memset(&iov, 0, sizeof(iov));
   1520     iov.iov_len = towire.p - dsobuf;
   1521     iov.iov_base = dsobuf;
   1522     if (!ioloop_send_message(srpl_connection->connection, srpl_connection_message_get(srpl_connection), &iov, 1)) {
   1523         INFO("send failed");
   1524         srpl_disconnect(srpl_connection);
   1525         return false;
   1526     }
   1527 
   1528     INFO(PRI_S_SRP " sent SRPLCandidate message on connection.", srpl_connection->name);
   1529     srpl_message_sent(srpl_connection);
   1530     return true;
   1531 }
   1532 
   1533 static bool
   1534 srpl_candidate_response_send(srpl_connection_t *srpl_connection, dso_message_types_t response_type)
   1535 {
   1536     uint8_t dsobuf[SRPL_CANDIDATE_RESPONSE_LENGTH];
   1537     dns_towire_state_t towire;
   1538     dso_message_t message;
   1539     struct iovec iov;
   1540 
   1541     if (!srpl_dso_message_setup(srpl_connection->dso, &message, &towire, dsobuf, sizeof(dsobuf),
   1542                                 srpl_connection_message_get(srpl_connection), false, true, 0, 0, srpl_connection)) {
   1543         return false;
   1544     }
   1545     dns_u16_to_wire(&towire, kDSOType_SRPLCandidate);
   1546     dns_rdlength_begin(&towire);
   1547     dns_rdlength_end(&towire);
   1548     dns_u16_to_wire(&towire, response_type);
   1549     dns_rdlength_begin(&towire);
   1550     dns_rdlength_end(&towire);
   1551     if (towire.error) {
   1552         ERROR("ran out of message space at " PUB_S_SRP ", :%d", __FILE__, towire.line);
   1553         return false;
   1554     }
   1555     memset(&iov, 0, sizeof(iov));
   1556     iov.iov_len = towire.p - dsobuf;
   1557     iov.iov_base = dsobuf;
   1558     if (!ioloop_send_message(srpl_connection->connection, srpl_connection_message_get(srpl_connection), &iov, 1)) {
   1559         INFO("send failed");
   1560         srpl_disconnect(srpl_connection);
   1561         return false;
   1562     }
   1563 
   1564     INFO(PRI_S_SRP " sent SRPLCandidate response on connection.", srpl_connection->name);
   1565     srpl_message_sent(srpl_connection);
   1566     return true;
   1567 }
   1568 
   1569 // Qsort comparison function for message receipt times.
   1570 static int
   1571 srpl_message_compare(const void *v1, const void *v2)
   1572 {
   1573     const message_t *m1 = *(message_t**)v1;
   1574     const message_t *m2 = *(message_t**)v2;
   1575     if (m1->received_time - m2->received_time < 0) {
   1576         return -1;
   1577     } else if(m1->received_time - m2->received_time > 0) {
   1578         return 1;
   1579     } else {
   1580         return 0;
   1581     }
   1582 }
   1583 
   1584 static bool
   1585 srpl_host_message_send(srpl_connection_t *srpl_connection, adv_host_t *host)
   1586 {
   1587     uint8_t *dsobuf = NULL;
   1588     size_t dsobuf_length = SRPL_HOST_MESSAGE_LENGTH;
   1589     dns_towire_state_t towire;
   1590     dso_message_t message;
   1591     struct iovec *iov = NULL;
   1592     int num_messages; // Number of SRP updates we need to send
   1593     int iovec_count = 1, iov_cur = 0;
   1594     message_t **messages = NULL;
   1595     bool rv = false;
   1596 
   1597     if (host->message == NULL) {
   1598         FAULT("no host message to send for " PRI_S_SRP " on " PRI_S_SRP ".", host->name, srpl_connection->name);
   1599         goto out;
   1600     }
   1601     iovec_count++;
   1602     num_messages = 1;
   1603 
   1604     time_t srpl_now = srp_time();
   1605 
   1606     if (SRPL_SUPPORTS(srpl_connection, SRPL_VARIATION_MULTI_HOST_MESSAGE)) {
   1607         int num_instances = host->instances == NULL ? 0 : host->instances->num;
   1608         for (int i = 0; i < num_instances; i++) {
   1609             adv_instance_t *instance = host->instances->vec[i];
   1610             if (instance != NULL) {
   1611                 if (instance->message != NULL && instance->message != host->message && instance->message != NULL) {
   1612                     num_messages++;
   1613                 }
   1614             }
   1615         }
   1616         messages = calloc(num_messages, sizeof (*messages));
   1617         if (messages == NULL) {
   1618             INFO("no memory for message vector");
   1619             goto out;
   1620         }
   1621         messages[0] = host->message;
   1622         num_messages = 1;
   1623         for (int i = 0; i < num_instances; i++) {
   1624             adv_instance_t *instance = host->instances->vec[i];
   1625             if (instance != NULL) {
   1626                 if (instance->message != NULL && instance->message != host->message && instance->message != NULL) {
   1627                     messages[num_messages] = instance->message;
   1628                     num_messages++;
   1629                 }
   1630             }
   1631         }
   1632         qsort(messages, num_messages, sizeof(*messages), srpl_message_compare);
   1633         // eliminate the duplicate messages in the sorted array.
   1634         int nondup_pos = 0;
   1635         INFO("messages[0] = %p, received_time = %ld", messages[0], srpl_now - messages[0]->received_time);
   1636         for (int i = 1; i < num_messages; i++) {
   1637             if (messages[i] != messages[nondup_pos]) {
   1638                 nondup_pos++;
   1639                 messages[nondup_pos] = messages[i];
   1640                 INFO("messages[%d] = messages[%d] (%p), received_time = %ld", nondup_pos, i, messages[i],
   1641                      srpl_now - messages[nondup_pos]->received_time);
   1642             }
   1643         }
   1644         num_messages = nondup_pos + 1;
   1645         // update iovec_count and dsobuf_length based on the number of messages.
   1646         // nondup_pos now is the position of the last unique message and it also
   1647         // is the number of extra host messages we have got in this SRPLHost message.
   1648         iovec_count += 2 * nondup_pos;
   1649         // Account for additional HostMessage TLV.
   1650         dsobuf_length += (DSO_TLV_HEADER_SIZE + sizeof(uint32_t) + sizeof(uint32_t) + sizeof(uint32_t)) * nondup_pos;
   1651     }
   1652     iov = calloc(iovec_count, sizeof(*iov));
   1653     if (iov == NULL) {
   1654         ERROR("no memory for iovec.");
   1655         goto out;
   1656     }
   1657     dsobuf = malloc(dsobuf_length);
   1658     if (dsobuf == NULL) {
   1659         ERROR("no memory for dso buffer");
   1660         goto out;
   1661     }
   1662 
   1663     if (!srpl_dso_message_setup(srpl_connection->dso, &message, &towire,
   1664                                 dsobuf, dsobuf_length, NULL, false, false, 0, 0, srpl_connection)) {
   1665         goto out;
   1666     }
   1667 
   1668     // For testing, make the update time really wrong so that signature validation fails. This will not actually
   1669     // cause a failure unless the SRP requestor sends a time range, so really only useful for testing with the
   1670     // mDNSResponder srp-client, not with e.g. a Thread client.
   1671     if (host->server_state != NULL && host->server_state->break_srpl_time) {
   1672         INFO("breaking time: %lu -> %lu", (unsigned long)srpl_now, (unsigned long)(srpl_now + 1800));
   1673         srpl_now += 1800;
   1674     }
   1675     dns_u16_to_wire(&towire, kDSOType_SRPLHost);
   1676     dns_rdlength_begin(&towire);
   1677     dns_rdlength_end(&towire);
   1678     dns_u16_to_wire(&towire, kDSOType_SRPLHostname);
   1679     dns_rdlength_begin(&towire);
   1680     dns_full_name_to_wire(NULL, &towire, host->name);
   1681     dns_rdlength_end(&towire);
   1682     // v0 of the protocol only includes one host message option, and timeoffset is sent
   1683     // as its own secondary TLV.
   1684     if (!SRPL_SUPPORTS(srpl_connection, SRPL_VARIATION_MULTI_HOST_MESSAGE)) {
   1685         dns_u16_to_wire(&towire, kDSOType_SRPLTimeOffset);
   1686         dns_rdlength_begin(&towire);
   1687         dns_u32_to_wire(&towire, (uint32_t)(srpl_now - host->message->received_time));
   1688         dns_rdlength_end(&towire);
   1689     }
   1690     dns_u16_to_wire(&towire, kDSOType_SRPLServerStableID);
   1691     dns_rdlength_begin(&towire);
   1692     dns_u64_to_wire(&towire, host->server_stable_id);
   1693     dns_rdlength_end(&towire);
   1694     if (!SRPL_SUPPORTS(srpl_connection, SRPL_VARIATION_MULTI_HOST_MESSAGE)) {
   1695         dns_u16_to_wire(&towire, kDSOType_SRPLHostMessage);
   1696         dns_u16_to_wire(&towire, host->message->length);
   1697         iov[iov_cur].iov_len = towire.p - dsobuf;
   1698         iov[iov_cur].iov_base = dsobuf;
   1699         iov_cur++;
   1700         iov[iov_cur].iov_len = host->message->length;
   1701         iov[iov_cur].iov_base = &host->message->wire;
   1702         iov_cur++;
   1703     } else {
   1704         uint8_t *start = dsobuf;
   1705         for (int i = 0; i < num_messages; i++) {
   1706             dns_u16_to_wire(&towire, kDSOType_SRPLHostMessage);
   1707             dns_u16_to_wire(&towire, 12 + messages[i]->length);
   1708             dns_u32_to_wire(&towire, messages[i]->lease);
   1709             dns_u32_to_wire(&towire, messages[i]->key_lease);
   1710             dns_u32_to_wire(&towire, (uint32_t)(srpl_now - messages[i]->received_time));
   1711             iov[iov_cur].iov_len = towire.p - start;
   1712             iov[iov_cur].iov_base = start;
   1713             iov_cur++;
   1714             iov[iov_cur].iov_len = messages[i]->length;
   1715             iov[iov_cur].iov_base = &messages[i]->wire;
   1716             iov_cur++;
   1717             start = towire.p;
   1718         }
   1719     }
   1720 
   1721     if (towire.error) {
   1722         ERROR("ran out of message space at " PUB_S_SRP ", :%d", __FILE__, towire.line);
   1723         goto out;
   1724     }
   1725 
   1726     if (!ioloop_send_message(srpl_connection->connection, srpl_connection_message_get(srpl_connection), iov, iov_cur)) {
   1727         INFO("send failed");
   1728         srpl_disconnect(srpl_connection);
   1729         return false;
   1730     }
   1731 
   1732     INFO(PRI_S_SRP " sent SRPLHost message %02x%02x " PRI_S_SRP " stable ID %" PRIx64 ", Host Message Count %d",
   1733          srpl_connection->name, message.buf[0], message.buf[1], host->name, host->server_stable_id, num_messages);
   1734     rv = true;
   1735     srpl_message_sent(srpl_connection);
   1736     out:
   1737     if (messages != NULL) {
   1738         free(messages);
   1739     }
   1740     if (iov != NULL) {
   1741         free(iov);
   1742     }
   1743     if (dsobuf != NULL) {
   1744         free(dsobuf);
   1745     }
   1746     return rv;
   1747 }
   1748 
   1749 
   1750 static bool
   1751 srpl_host_response_send(srpl_connection_t *srpl_connection, int rcode)
   1752 {
   1753     uint8_t dsobuf[SRPL_HOST_RESPONSE_LENGTH];
   1754     dns_towire_state_t towire;
   1755     dso_message_t message;
   1756     struct iovec iov;
   1757 
   1758     if (!srpl_dso_message_setup(srpl_connection->dso, &message, &towire, dsobuf, sizeof(dsobuf),
   1759                                 srpl_connection_message_get(srpl_connection), false, true, rcode, 0, srpl_connection)) {
   1760         return false;
   1761     }
   1762     dns_u16_to_wire(&towire, kDSOType_SRPLHost);
   1763     dns_rdlength_begin(&towire);
   1764     dns_rdlength_end(&towire);
   1765     if (towire.error) {
   1766         ERROR("ran out of message space at " PUB_S_SRP ", :%d", __FILE__, towire.line);
   1767         return false;
   1768     }
   1769     memset(&iov, 0, sizeof(iov));
   1770     iov.iov_len = towire.p - dsobuf;
   1771     iov.iov_base = dsobuf;
   1772     if (!ioloop_send_message(srpl_connection->connection, srpl_connection_message_get(srpl_connection), &iov, 1)) {
   1773         INFO("send failed");
   1774         srpl_disconnect(srpl_connection);
   1775         return false;
   1776     }
   1777     INFO(PRI_S_SRP " sent SRPLHost response %02x%02x rcode %d on connection.",
   1778          srpl_connection->name, message.buf[0], message.buf[1], rcode);
   1779     srpl_message_sent(srpl_connection);
   1780     return true;
   1781 }
   1782 
   1783 static bool
   1784 srpl_retry_delay_send(srpl_connection_t *srpl_connection, uint32_t delay)
   1785 {
   1786     uint8_t dsobuf[SRPL_RETRY_DELAY_LENGTH];
   1787     dns_towire_state_t towire;
   1788     dso_message_t message;
   1789     struct iovec iov;
   1790     srpl_domain_t *domain = srpl_connection_domain(srpl_connection);
   1791     if (domain == NULL) {
   1792         ERROR("domain is NULL.");
   1793         return false;
   1794     }
   1795 
   1796     // If this isn't a server, there's no benefit to sending retry delay.
   1797     if (!srpl_connection->is_server) {
   1798         return true;
   1799     }
   1800 
   1801     if (!srpl_dso_message_setup(srpl_connection->dso, &message, &towire, dsobuf, sizeof(dsobuf),
   1802                                 srpl_connection_message_get(srpl_connection), false, true, dns_rcode_noerror, 0,
   1803                                 srpl_connection))
   1804     {
   1805         return false;
   1806     }
   1807     dns_u16_to_wire(&towire, kDSOType_RetryDelay);
   1808     dns_rdlength_begin(&towire);
   1809     dns_u32_to_wire(&towire, delay); // One hour.
   1810     dns_rdlength_end(&towire);
   1811     if (towire.error) {
   1812         ERROR("ran out of message space at " PUB_S_SRP ", :%d", __FILE__, towire.line);
   1813         return false;
   1814     }
   1815     memset(&iov, 0, sizeof(iov));
   1816     iov.iov_len = towire.p - dsobuf;
   1817     iov.iov_base = dsobuf;
   1818     if (!ioloop_send_message(srpl_connection->connection, srpl_connection_message_get(srpl_connection), &iov, 1)) {
   1819         INFO("send failed");
   1820         srpl_disconnect(srpl_connection);
   1821         return false;
   1822     }
   1823 
   1824     INFO(PRI_S_SRP " sent Retry Delay, id %" PRIx64, srpl_connection->name, domain->partner_id);
   1825     srpl_message_sent(srpl_connection);
   1826     return true;
   1827 }
   1828 static bool
   1829 srpl_find_dso_additionals(srpl_connection_t *srpl_connection, dso_state_t *dso, const dso_message_types_t *additionals,
   1830                           bool *required, bool *multiple, const char **names, int *indices, int num,
   1831                           int min_additls, int max_additls, const char *message_name, void *context,
   1832                           bool (*iterator)(int index, const uint8_t *buf, unsigned *offp, uint16_t len, void *context))
   1833 {
   1834     int ret = true;
   1835     int count = 0;
   1836 
   1837     for (int j = 0; j < num; j++) {
   1838         indices[j] = -1;
   1839     }
   1840     for (unsigned i = 0; i < dso->num_additls; i++) {
   1841         bool found = false;
   1842         for (int j = 0; j < num; j++) {
   1843             if (dso->additl[i].opcode == additionals[j]) {
   1844                 if (indices[j] != -1 && (multiple == NULL || multiple[j] == false)) {
   1845                     ERROR(PRI_S_SRP ": duplicate " PUB_S_SRP " for " PUB_S_SRP ".",
   1846                           srpl_connection->name, names[j], message_name);
   1847                     ret = false;
   1848                     continue;
   1849                 }
   1850                 indices[j] = i;
   1851                 unsigned offp = 0;
   1852                 if (!iterator(j, dso->additl[i].payload, &offp, dso->additl[i].length, context) ||
   1853                     offp != dso->additl[i].length)
   1854                 {
   1855                     ERROR(PRI_S_SRP ": invalid " PUB_S_SRP " for " PUB_S_SRP ".",
   1856                           srpl_connection->name, names[j], message_name);
   1857                     found = true; // So we don't complain later.
   1858                     count++;
   1859                     ret = false;
   1860                 } else {
   1861                     found = true;
   1862                     count++;
   1863                 }
   1864             }
   1865         }
   1866         if (!found) {
   1867             ERROR(PRI_S_SRP ": unexpected opcode %x for " PUB_S_SRP ".",
   1868                   srpl_connection->name, dso->additl[i].opcode, message_name);
   1869         }
   1870     }
   1871     for (int j = 0; j < num; j++) {
   1872         if (required[j] && indices[j] == -1) {
   1873             ERROR(PRI_S_SRP ": missing " PUB_S_SRP " for " PUB_S_SRP ".",
   1874                   srpl_connection->name, names[j], message_name);
   1875             ret = false;
   1876         }
   1877     }
   1878     if (count < min_additls) {
   1879         ERROR(PRI_S_SRP ": not enough additional TLVs (%d < %d) for " PUB_S_SRP ".",
   1880               srpl_connection->name, count, min_additls, message_name);
   1881         ret = false;
   1882     } else if (count > max_additls) {
   1883         ERROR(PRI_S_SRP ": too many additional TLVs (%d > %d) for " PUB_S_SRP ".",
   1884               srpl_connection->name, count, max_additls, message_name);
   1885         ret = false;
   1886     }
   1887     return ret;
   1888 }
   1889 
   1890 static void
   1891 srpl_connection_discontinue(srpl_connection_t *srpl_connection)
   1892 {
   1893     srpl_connection->candidates_not_generated = true;
   1894     // Cancel any outstanding reconnect wakeup event, so that we don't accidentally restart the connection we decided to
   1895     // discontinue.
   1896     if (srpl_connection->reconnect_wakeup != NULL) {
   1897         ioloop_cancel_wake_event(srpl_connection->reconnect_wakeup);
   1898         // We have to get rid of the wakeup here because it's holding a reference to the connection, which we may want to
   1899         // have go away.
   1900         ioloop_wakeup_release(srpl_connection->reconnect_wakeup);
   1901         srpl_connection->reconnect_wakeup = NULL;
   1902     }
   1903     // Cancel any outstanding state timeout event.
   1904     if (srpl_connection->state_timeout != NULL) {
   1905         ioloop_cancel_wake_event(srpl_connection->state_timeout);
   1906         ioloop_wakeup_release(srpl_connection->state_timeout);
   1907         srpl_connection->state_timeout = NULL;
   1908     }
   1909     srpl_connection_reset(srpl_connection);
   1910     srpl_connection_next_state(srpl_connection, srpl_state_disconnect);
   1911 }
   1912 
   1913 static bool
   1914 srpl_session_message_parse_in(int index, const uint8_t *buffer, unsigned *offp, uint16_t length, void *context)
   1915 {
   1916     srpl_session_t *session = context;
   1917 
   1918     switch(index) {
   1919     case 0:
   1920         session->new_partner = true;
   1921         return true;
   1922     case 1:
   1923         return dns_name_parse(&session->domain_name, buffer, length, offp, length);
   1924     case 2:
   1925         return dns_u16_parse(buffer, length, offp, &session->remote_version);
   1926     }
   1927     return false;
   1928 }
   1929 
   1930 static bool
   1931 srpl_session_message_parse(srpl_connection_t *srpl_connection,
   1932                            srpl_event_t *event, dso_state_t *dso, const char *message_name)
   1933 {
   1934     const char *names[3] = { "New Partner", "Domain Name", "Protocol Version" };
   1935     dso_message_types_t additionals[3] = { kDSOType_SRPLNewPartner, kDSOType_SRPLDomainName, kDSOType_SRPLVersion };
   1936     bool required[3] = { false, false, false };
   1937     bool multiple[3] = { false, false, false };
   1938     int indices[3];
   1939 
   1940     if (dso->primary.length != 8) {
   1941         ERROR(PRI_S_SRP ": invalid DSO Primary length %d for " PUB_S_SRP ".",
   1942               srpl_connection->name, dso->primary.length, message_name);
   1943         return false;
   1944     }
   1945 
   1946     unsigned offp = 0;
   1947     srpl_event_content_type_set(event, srpl_event_content_type_session);
   1948     if (!dns_u64_parse(dso->primary.payload, 8, &offp, &event->content.session.partner_id)) {
   1949         // This should be un-possible.
   1950         ERROR(PRI_S_SRP ": invalid DSO Primary server id in " PRI_S_SRP ".",
   1951               srpl_connection->name, message_name);
   1952         return false;
   1953     }
   1954 
   1955     event->content.session.new_partner = false;
   1956     if (!srpl_find_dso_additionals(srpl_connection, dso, additionals, required, multiple, names, indices, 3, 0, 3,
   1957                                    "SRPLSession message", &(event->content.session), srpl_session_message_parse_in)) {
   1958         return false;
   1959     }
   1960 
   1961     srpl_domain_t *domain = srpl_connection_domain(srpl_connection);
   1962     if (domain == NULL) {
   1963         ERROR("connection has no domain.");
   1964         return false;
   1965     }
   1966     // If this is an unidentified connection that is associated with a temporary instance and
   1967     // a temporary domain, we need to retrieve the domain name from the session message and
   1968     // find the real domain for this connection.
   1969     // A connection can not be identified due to either
   1970     // the sending partner is still in the startup state and has not advertised yet; or
   1971     // the sending partner is in the routine state and has advertised the domain, but
   1972     // the receiving partner has not discovered it yet.
   1973     DNS_NAME_GEN_SRP(event->content.session.domain_name, dname_buf);
   1974     if (domain->name == NULL) {
   1975         srp_server_t *server_state = domain->server_state;
   1976         if (server_state == NULL) {
   1977             ERROR("server state is NULL.");
   1978             return false;
   1979         }
   1980         if (event->content.session.domain_name == NULL) {
   1981             ERROR(PUB_S_SRP " does not include domain name", message_name);
   1982             return false;
   1983         }
   1984         srpl_domain_t **dp, *match_domain = NULL;
   1985         // Find the domain.
   1986         for (dp = &server_state->srpl_domains; *dp; dp = &(*dp)->next) {
   1987             match_domain = *dp;
   1988             if (!strcasecmp(match_domain->name, dname_buf)) {
   1989                 break;
   1990             }
   1991         }
   1992         if (match_domain == NULL) {
   1993             ERROR("domain name in " PUB_S_SRP " does not match any domain", message_name);
   1994             return false;
   1995         }
   1996         RELEASE_HERE(srpl_connection->instance->domain, srpl_domain);
   1997         srpl_connection->instance->domain = match_domain;
   1998         RETAIN_HERE(match_domain, srpl_domain);
   1999     }
   2000 
   2001 
   2002     if (event->content.session.remote_version >= SRPL_VERSION_MULTI_HOST_MESSAGE) {
   2003         srpl_connection->variation_mask |= SRPL_VARIATION_MULTI_HOST_MESSAGE;
   2004     }
   2005 
   2006     INFO(PRI_S_SRP " received " PUB_S_SRP ", id %" PRIx64 ", startup " PUB_S_SRP
   2007          ", domain " PRI_S_SRP ", version %d", srpl_connection->name, message_name,
   2008          event->content.session.partner_id, event->content.session.new_partner? "yes" : "no",
   2009          dname_buf, event->content.session.remote_version);
   2010     return true;
   2011 }
   2012 
   2013 static void
   2014 srpl_session_message(srpl_connection_t *srpl_connection, message_t *message, dso_state_t *dso)
   2015 {
   2016     srpl_event_t event;
   2017     srpl_event_initialize(&event, srpl_event_session_message_received);
   2018 
   2019     srpl_connection_message_set(srpl_connection, message);
   2020     if (!srpl_session_message_parse(srpl_connection, &event, dso, "SRPLSession message")) {
   2021         dns_name_free(event.content.session.domain_name);
   2022         srpl_disconnect(srpl_connection);
   2023         return;
   2024     }
   2025     srpl_event_deliver(srpl_connection, &event);
   2026     dns_name_free(event.content.session.domain_name);
   2027 }
   2028 
   2029 static void
   2030 srpl_session_response(srpl_connection_t *srpl_connection, dso_state_t *dso)
   2031 {
   2032     srpl_event_t event;
   2033     srpl_event_initialize(&event, srpl_event_session_response_received);
   2034     if (!srpl_session_message_parse(srpl_connection, &event, dso, "SRPLSession response")) {
   2035         srpl_disconnect(srpl_connection);
   2036         return;
   2037     }
   2038 
   2039     srpl_event_deliver(srpl_connection, &event);
   2040     dns_name_free(event.content.session.domain_name);
   2041 }
   2042 
   2043 static bool
   2044 srpl_send_candidates_message_parse(srpl_connection_t *srpl_connection, dso_state_t *dso, const char *message_name)
   2045 {
   2046     if (dso->primary.length != 0) {
   2047         ERROR(PRI_S_SRP ": invalid DSO Primary length %d for " PUB_S_SRP ".",
   2048               srpl_connection->name, dso->primary.length, message_name);
   2049         srpl_disconnect(srpl_connection);
   2050         return false;
   2051     }
   2052     return true;
   2053 }
   2054 
   2055 static void
   2056 srpl_send_candidates_message(srpl_connection_t *srpl_connection, message_t *message, dso_state_t *dso)
   2057 {
   2058     srpl_event_t event;
   2059     srpl_event_initialize(&event, srpl_event_send_candidates_message_received);
   2060 
   2061     srpl_connection_message_set(srpl_connection, message);
   2062     if (srpl_send_candidates_message_parse(srpl_connection, dso, "SRPLSendCandidates message")) {
   2063         INFO(PRI_S_SRP " received SRPLSendCandidates query", srpl_connection->name);
   2064 
   2065         srpl_event_deliver(srpl_connection, &event);
   2066         return;
   2067     }
   2068     srpl_disconnect(srpl_connection);
   2069 }
   2070 
   2071 static void
   2072 srpl_send_candidates_response(srpl_connection_t *srpl_connection, dso_state_t *dso)
   2073 {
   2074     srpl_event_t event;
   2075     srpl_event_initialize(&event, srpl_event_send_candidates_response_received);
   2076 
   2077     if (srpl_send_candidates_message_parse(srpl_connection, dso, "SRPLSendCandidates message")) {
   2078         INFO(PRI_S_SRP " received SRPLSendCandidates response", srpl_connection->name);
   2079         srpl_event_deliver(srpl_connection, &event);
   2080         return;
   2081     }
   2082 }
   2083 
   2084 static bool
   2085 srpl_candidate_message_parse_in(int index, const uint8_t *buffer, unsigned *offp, uint16_t length, void *context)
   2086 {
   2087     srpl_candidate_t *candidate = context;
   2088 
   2089     switch(index) {
   2090     case 0:
   2091         return dns_name_parse(&candidate->name, buffer, length, offp, length);
   2092     case 1:
   2093         return dns_u32_parse(buffer, length, offp, &candidate->update_offset);
   2094     case 2:
   2095         return dns_u32_parse(buffer, length, offp, &candidate->key_id);
   2096     }
   2097     return false;
   2098 }
   2099 
   2100 static void
   2101 srpl_candidate_message(srpl_connection_t *srpl_connection, message_t *message, dso_state_t *dso)
   2102 {
   2103     const char *names[3] = { "Candidate Name", "Time Offset", "Key ID" };
   2104     dso_message_types_t additionals[3] = { kDSOType_SRPLHostname, kDSOType_SRPLTimeOffset, kDSOType_SRPLKeyID };
   2105     bool required[3] = { true, true, true };
   2106     int indices[3];
   2107 
   2108     srpl_event_t event;
   2109     srpl_event_initialize(&event, srpl_event_candidate_received);
   2110     srpl_connection_message_set(srpl_connection, message);
   2111     if (!srpl_event_content_type_set(&event, srpl_event_content_type_candidate) ||
   2112         !srpl_find_dso_additionals(srpl_connection, dso, additionals,
   2113                                    required, NULL, names, indices, 3, 3, 3, "SRPLCandidate message",
   2114                                    event.content.candidate, srpl_candidate_message_parse_in)) {
   2115         goto fail;
   2116     }
   2117 
   2118     event.content.candidate->update_time = srp_time() - event.content.candidate->update_offset;
   2119     srpl_event_deliver(srpl_connection, &event);
   2120     srpl_event_content_type_set(&event, srpl_event_content_type_none);
   2121     return;
   2122 
   2123 fail:
   2124     srpl_disconnect(srpl_connection);
   2125 }
   2126 
   2127 static bool
   2128 srpl_candidate_response_parse_in(int index,
   2129                                  const uint8_t *UNUSED buffer, unsigned *offp, uint16_t length, void *context)
   2130 {
   2131     srpl_candidate_disposition_t *candidate_disposition = context;
   2132 
   2133     if (length != 0) {
   2134         return false;
   2135     }
   2136 
   2137     switch(index) {
   2138     case 0:
   2139         *candidate_disposition = srpl_candidate_yes;
   2140         break;
   2141     case 1:
   2142         *candidate_disposition = srpl_candidate_no;
   2143         break;
   2144     case 2:
   2145         *candidate_disposition = srpl_candidate_conflict;
   2146         break;
   2147     }
   2148     *offp = 0;
   2149     return true;
   2150 }
   2151 
   2152 static void
   2153 srpl_candidate_response(srpl_connection_t *srpl_connection, dso_state_t *dso)
   2154 {
   2155     const char *names[3] = { "Candidate Yes", "Candidate No", "Conflict" };
   2156     dso_message_types_t additionals[3] = { kDSOType_SRPLCandidateYes, kDSOType_SRPLCandidateNo, kDSOType_SRPLConflict };
   2157     bool required[3] = { false, false, false };
   2158     int indices[3];
   2159     srpl_event_t event;
   2160 
   2161     srpl_event_initialize(&event, srpl_event_candidate_response_received);
   2162     srpl_event_content_type_set(&event, srpl_event_content_type_candidate_disposition);
   2163     if (!srpl_find_dso_additionals(srpl_connection, dso, additionals,
   2164                                    required, NULL, names, indices, 3, 1, 1, "SRPLCandidate reply",
   2165                                    &event.content.disposition, srpl_candidate_response_parse_in)) {
   2166         goto fail;
   2167     }
   2168     srpl_event_deliver(srpl_connection, &event);
   2169     return;
   2170 
   2171 fail:
   2172     srpl_disconnect(srpl_connection);
   2173 }
   2174 
   2175 static bool
   2176 srpl_host_message_parse_in(int index, const uint8_t *buffer, unsigned *offp, uint16_t length, void *context)
   2177 {
   2178     srpl_host_update_t *update = context;
   2179 
   2180     switch(index) {
   2181     case 0: // Host Name
   2182         if (update->hostname == NULL) {
   2183             unsigned offp_orig = *offp;
   2184             bool ret = dns_name_parse(&update->hostname, buffer, length, offp, length);
   2185             update->num_bytes = *offp - offp_orig;
   2186             update->orig_buffer = (intptr_t)buffer;
   2187             return ret;
   2188         } else {
   2189             if ((intptr_t)buffer == update->orig_buffer) {
   2190                 (*offp) += update->num_bytes;
   2191             }
   2192             return true;
   2193         }
   2194     case 1: // Host Message
   2195         if (update->messages != NULL) {
   2196             const uint8_t *message_buffer;
   2197             size_t message_length;
   2198             if (update->rcode) {
   2199                 message_buffer = buffer + 12; // lease, key-lease, time offset
   2200                 message_length = length - 12;
   2201             } else {
   2202                 message_buffer = buffer;
   2203                 message_length = length;
   2204             }
   2205             message_t *message = ioloop_message_create(message_length);
   2206             if (message == NULL) {
   2207                 return false;
   2208             }
   2209             if (update->rcode) {
   2210                 uint32_t time_offset = 0;
   2211                 if (!(dns_u32_parse(buffer, length, offp, &message->lease) &&
   2212                       dns_u32_parse(buffer, length, offp, &message->key_lease) &&
   2213                       dns_u32_parse(buffer, length, offp, &time_offset)))
   2214                 {
   2215                     INFO("failed to parse lease, key_lease or time_offset");
   2216                     return false;
   2217                 }
   2218                 message->received_time = srp_time() - time_offset;
   2219             }
   2220             memcpy(&message->wire, message_buffer, message_length);
   2221 
   2222             // We are parsing across the same message, so we can't exceed max_messages here.
   2223             update->messages[update->num_messages++] = message;
   2224         } else {
   2225             update->max_messages++;
   2226         }
   2227         *offp = length;
   2228         return true;
   2229     case 2: // Server Stable ID
   2230         return dns_u64_parse(buffer, length, offp, &update->server_stable_id);
   2231     case 3: // Time Offset
   2232         return dns_u32_parse(buffer, length, offp, &update->update_offset);
   2233     }
   2234     return false;
   2235 }
   2236 
        // Handle an incoming SRPLHost message. The message is recorded on the connection, its
        // additionals are parsed into a host_update event in two passes (count, then copy), the
        // per-message receive times are filled in or checked, and the event is delivered. Any
        // failure resets the event content (if it was set) and disconnects the connection.
   2237 static void
   2238 srpl_host_message(srpl_connection_t *srpl_connection, message_t *message, dso_state_t *dso)
   2239 {
   2240     srpl_event_t event;
            // Zero the event up front: the fail path inspects event.content_type, and it can be
            // reached before srpl_event_initialize() has run.
   2241     memset(&event, 0, sizeof(event));
   2242     srpl_connection_message_set(srpl_connection, message);
   2243     if (dso->primary.length != 0) {
   2244         ERROR(PRI_S_SRP ": invalid DSO Primary length %d for SRPLHost message.",
   2245               srpl_connection->name, dso->primary.length);
   2246         goto fail;
   2247     } else {
   2248         const char *names[4] = { "Host Name", "Host Message", "Server Stable ID", "Time Offset" };
   2249         dso_message_types_t additionals[4] = { kDSOType_SRPLHostname, kDSOType_SRPLHostMessage,
   2250             kDSOType_SRPLServerStableID, kDSOType_SRPLTimeOffset };
   2251         bool required[4] = { true, true, false, true };
   2252         bool multiple[4] = { false, true, false, false };
   2253         int indices[4];
   2254         int num_additls = 4;
   2255 
   2256         // Parse host message
   2257         srpl_event_initialize(&event, srpl_event_host_message_received);
   2258         srpl_event_content_type_set(&event, srpl_event_content_type_host_update);
                // With the multi-host-message variation the last table entry (Time Offset) is left
                // out of the search, and rcode is set so that srpl_host_message_parse_in() treats the
                // first 12 bytes of each Host Message TLV as lease, key lease and time offset.
   2259         if (SRPL_SUPPORTS(srpl_connection, SRPL_VARIATION_MULTI_HOST_MESSAGE)) {
   2260             num_additls--;
   2261             event.content.host_update.rcode = 1; // temporarily use rcode for flag
   2262         }
   2263 
                // First pass: host_update.messages is still NULL, so the parse callback only counts
                // the Host Message TLVs into max_messages.
   2264         if (!srpl_find_dso_additionals(srpl_connection, dso, additionals, required, multiple, names, indices,
   2265                                        num_additls, num_additls - 1, num_additls + MAX_ADDITIONAL_HOST_MESSAGES,
   2266                                        "SRPLHost message", &event.content.host_update, srpl_host_message_parse_in)) {
   2267             goto fail;
   2268         }
   2269         // update->max_messages can't be zero here, or we would have gotten a false return from
   2270         // srpl_find_dso_additionals and not gotten here.
   2271         event.content.host_update.messages = calloc(event.content.host_update.max_messages,
   2272                                                     sizeof (*event.content.host_update.messages));
   2273         if (event.content.host_update.messages == NULL) {
   2274             goto fail;
   2275         }
   2276         // Now that we know how many messages, we can copy them out.
   2277         if (!srpl_find_dso_additionals(srpl_connection, dso, additionals, required, multiple, names, indices,
   2278                                        num_additls, num_additls - 1, num_additls + MAX_ADDITIONAL_HOST_MESSAGES,
   2279                                        "SRPLHost message", &event.content.host_update, srpl_host_message_parse_in)) {
   2280             goto fail;
   2281         }
   2282         DNS_NAME_GEN_SRP(event.content.host_update.hostname, hostname_buf);
   2283         if (!SRPL_SUPPORTS(srpl_connection, SRPL_VARIATION_MULTI_HOST_MESSAGE)) {
                    // Single-message form: the receive time of the one message comes from the
                    // message-level Time Offset TLV.
   2284             time_t update_time = srp_time() - event.content.host_update.update_offset;
   2285             event.content.host_update.messages[0]->received_time = update_time;
   2286             INFO(PRI_S_SRP " received SRPLHost message %x for " PRI_DNS_NAME_SRP " server stable ID %" PRIx64
   2287                  " update offset = %d", srpl_connection->name, ntohs(message->wire.id),
   2288                  DNS_NAME_PARAM_SRP(event.content.host_update.hostname, hostname_buf),
   2289                  event.content.host_update.server_stable_id, event.content.host_update.update_offset);
   2290         } else {
   2291             // Make sure times are sequential.
                    // Each message's received_time was set by the parse callback; a message whose time
                    // is not strictly later than the previous one invalidates the whole SRPLHost message.
   2292             time_t last_received_time = event.content.host_update.messages[0]->received_time;
   2293             INFO(PRI_S_SRP " received SRPLHost message %x for " PRI_DNS_NAME_SRP " server stable ID %" PRIx64
   2294                  " message 0 received time = %ld", srpl_connection->name, ntohs(message->wire.id),
   2295                  DNS_NAME_PARAM_SRP(event.content.host_update.hostname, hostname_buf),
   2296                  event.content.host_update.server_stable_id, srp_time() - last_received_time);
   2297             for (int i = 1; i < event.content.host_update.num_messages; i++) {
   2298                 time_t cur_received_time = event.content.host_update.messages[i]->received_time;
   2299                 if (cur_received_time - last_received_time <= 0) {
   2300                     INFO(PRI_S_SRP
   2301                          " received invalid SRPLHost message %x with message %d time %lld <= message %d time %lld",
   2302                          srpl_connection->name, ntohs(event.content.host_update.messages[i]->wire.id),
   2303                          i, (long long)cur_received_time, i - 1, (long long)last_received_time);
   2304                     goto fail_no_message;
   2305                 }
   2306                 INFO("message %d received time = %ld", i, cur_received_time);
   2307                 last_received_time = cur_received_time;
   2308             }
   2309         }
                // rcode was only borrowed as a parse flag; clear it before the event is delivered.
   2310         event.content.host_update.rcode = 0;
   2311         srpl_event_deliver(srpl_connection, &event);
   2312         srpl_event_content_type_set(&event, srpl_event_content_type_none);
   2313     }
   2314     return;
        // fail logs a generic "invalid message" line; fail_no_message is entered directly when a
        // more specific message has already been logged.
   2315 fail:
   2316     INFO(PRI_S_SRP " received invalid SRPLHost message %x", srpl_connection->name, ntohs(message->wire.id));
   2317 fail_no_message:
            // Reset the content type only if it was actually set to host_update (presumably this is
            // what releases the hostname and message array -- confirm in srpl_event_content_type_set).
   2318     if (event.content_type == srpl_event_content_type_host_update) {
   2319         srpl_event_content_type_set(&event, srpl_event_content_type_none);
   2320     }
   2321     srpl_disconnect(srpl_connection);
   2322     return;
   2323 }
   2324 
   2325 static void
   2326 srpl_host_response(srpl_connection_t *srpl_connection, message_t *message, dso_state_t *dso)
   2327 {
   2328     if (dso->primary.length != 0) {
   2329         ERROR(PRI_S_SRP ": invalid DSO Primary length %d for SRPLHost response.",
   2330               srpl_connection->name, dso->primary.length);
   2331         srpl_disconnect(srpl_connection);
   2332         return;
   2333     } else {
   2334         srpl_event_t event;
   2335         INFO(PRI_S_SRP " received SRPLHost response %x", srpl_connection->name, ntohs(message->wire.id));
   2336         srpl_event_initialize(&event, srpl_event_host_response_received);
   2337         srpl_event_content_type_set(&event, srpl_event_content_type_rcode);
   2338         event.content.rcode = dns_rcode_get(&message->wire);
   2339         srpl_event_deliver(srpl_connection, &event);
   2340         srpl_event_content_type_set(&event, srpl_event_content_type_none);
   2341         return;
   2342     }
   2343 }
   2344 
   2345 static void
   2346 srpl_keepalive_receive(srpl_connection_t *srpl_connection, int keepalive_interval, uint16_t xid)
   2347 {
   2348     if (srpl_connection->is_server) {
   2349         int num_standby = 0;
   2350         srpl_domain_t *domain = srpl_connection_domain(srpl_connection);
   2351         if (domain != NULL && domain->server_state != NULL) {
   2352             for (srpl_instance_t *unmatched = domain->server_state->unmatched_instances; unmatched != NULL; unmatched = unmatched->next) {
   2353                 srpl_connection_t *unidentified = unmatched->connection;
   2354                 if (unidentified != NULL && unidentified->state > srpl_state_session_evaluate) {
   2355                     num_standby++;
   2356                 }
   2357             }
   2358         }
   2359         int new_interval = num_standby * DEFAULT_KEEPALIVE_WAKEUP_EXPIRY / 2;
   2360         if (new_interval > 0 && srpl_connection->keepalive_interval != new_interval) {
   2361             srpl_connection->keepalive_interval = new_interval;
   2362             INFO("suggest keepalive %d for connection " PRI_S_SRP, new_interval, srpl_connection->name);
   2363         }
   2364         srpl_keepalive_send(srpl_connection, true, xid);
   2365     } else {
   2366         if (srpl_connection->state <= srpl_state_sync_wait) {
   2367             INFO("keepalive for connection " PRI_S_SRP " - old %d, new %d.", srpl_connection->name,
   2368                  srpl_connection->keepalive_interval, keepalive_interval);
   2369             srpl_connection->keepalive_interval = keepalive_interval;
   2370         }
   2371     }
   2372 }
   2373 
   2374 static void
   2375 srpl_dso_retry_delay(srpl_connection_t *srpl_connection, int reconnect_delay)
   2376 {
   2377     if (srpl_connection->instance == NULL) {
   2378         // If there's no instance, we're already disconnecting.
   2379         INFO(PRI_S_SRP ": no instance", srpl_connection->name);
   2380         return;
   2381     }
   2382     srpl_instance_t *instance = srpl_connection->instance;
   2383     RETAIN_HERE(srpl_connection, srpl_connection); // In case there's only one reference left.
   2384     if (instance->unmatched) {
   2385         INFO(PRI_S_SRP ": not sending retry delay for %d seconds because unidentified", srpl_connection->name, reconnect_delay);
   2386         if (instance->connection == srpl_connection) {
   2387             RELEASE_HERE(instance->connection, srpl_connection);
   2388             instance->connection = NULL;
   2389         }
   2390     } else {
   2391         INFO(PRI_S_SRP ": sending retry delay for %d seconds", srpl_connection->name, reconnect_delay);
   2392 
   2393         // Set things up to reconnect later.
   2394         srpl_connection_drop_state_delay(instance, srpl_connection, reconnect_delay);
   2395     }
   2396 
   2397     // Drop the connection
   2398     srpl_connection_discontinue(srpl_connection);
   2399     RELEASE_HERE(srpl_connection, srpl_connection); // For the function.
   2400 }
   2401 
   2402 static void
   2403 srpl_dso_message(srpl_connection_t *srpl_connection, message_t *message, dso_state_t *dso, bool response)
   2404 {
   2405     const char *name = "<null>";
   2406     if (srpl_connection != NULL) {
   2407         if (srpl_connection->instance != NULL) {
   2408             name = srpl_connection->instance->instance_name;
   2409         } else {
   2410             name = srpl_connection->name;
   2411         }
   2412     }
   2413 
   2414     switch(dso->primary.opcode) {
   2415     case kDSOType_SRPLSession:
   2416         if (response) {
   2417             srpl_session_response(srpl_connection, dso);
   2418         } else {
   2419             srpl_session_message(srpl_connection, message, dso);
   2420         }
   2421         break;
   2422 
   2423     case kDSOType_SRPLSendCandidates:
   2424         if (response) {
   2425             srpl_send_candidates_response(srpl_connection, dso);
   2426         } else {
   2427             srpl_send_candidates_message(srpl_connection, message, dso);
   2428         }
   2429         break;
   2430 
   2431     case kDSOType_SRPLCandidate:
   2432         if (response) {
   2433             srpl_candidate_response(srpl_connection, dso);
   2434         } else {
   2435             srpl_candidate_message(srpl_connection, message, dso);
   2436         }
   2437         break;
   2438 
   2439     case kDSOType_SRPLHost:
   2440         if (response) {
   2441             srpl_host_response(srpl_connection, message, dso);
   2442         } else {
   2443             srpl_host_message(srpl_connection, message, dso);
   2444         }
   2445         break;
   2446 
   2447     case kDSOType_Keepalive:
   2448         if (response) {
   2449             INFO(PRI_S_SRP ": keepalive response, xid %04x", name, message->wire.id);
   2450         } else if (message->wire.id) {
   2451             INFO(PRI_S_SRP ": keepalive query, xid %04x", name, message->wire.id);
   2452         } else {
   2453             INFO(PRI_S_SRP ": keepalive unidirectional, xid %04x (should be zero)", name, message->wire.id);
   2454         }
   2455         break;
   2456 
   2457     default:
   2458         INFO(PRI_S_SRP ": unexpected primary TLV %d", name, dso->primary.opcode);
   2459         dso_simple_response(srpl_connection->connection, NULL, &message->wire, dns_rcode_dsotypeni);
   2460         break;
   2461     }
   2462 
   2463 }
   2464 
   2465 // We should never get here. If we do, it means that we haven't gotten a keepalive in the required interval.
   2466 static void
   2467 srpl_keepalive_receive_wakeup(void *context)
   2468 {
   2469     srpl_connection_t *srpl_connection = context;
   2470 
   2471     INFO(PUB_S_SRP ": nothing heard from partner across keepalive interval--disconnecting", srpl_connection->name);
   2472     srpl_connection_discontinue(srpl_connection); // Drop the connection, don't send a retry_delay.
   2473 }
   2474 
        // DSO event callback for an SRPL connection. `context` is the srpl_connection_t, `dso` the DSO
        // state the event came from, and `event_context` depends on eventType (a message_t, a
        // dso_query_receive_context_t, a dso_keepalive_context_t or a dso_disconnect_context_t, as
        // cast in the individual cases). DSO messages and responses are dispatched through
        // srpl_dso_message(); keepalives and retry delays have their own handlers; the remaining
        // event types are only logged.
   2475 static void
   2476 srpl_instance_dso_event_callback(void *context, void *event_context, dso_state_t *dso, dso_event_type_t eventType)
   2477 {
   2478     message_t *message;
   2479     dso_query_receive_context_t *response_context;
   2480     dso_disconnect_context_t *disconnect_context;
   2481     dso_keepalive_context_t *keepalive_context;
   2482     srpl_connection_t *srpl_connection = context;
            // Name used in log lines: the instance name when there is one, else the connection name.
   2483     const char *name = "<null>";
   2484     if (srpl_connection != NULL) {
   2485         if (srpl_connection->instance != NULL) {
   2486             name = srpl_connection->instance->instance_name;
   2487         } else {
   2488             name = srpl_connection->name;
   2489         }
   2490     }
   2491 
   2492     switch(eventType)
   2493     {
   2494     case kDSOEventType_DNSMessage:
   2495         // We shouldn't get here because we already handled any DNS messages
   2496         message = event_context;
   2497         INFO(PRI_S_SRP ": DNS Message (opcode=%d) received from " PRI_S_SRP,
   2498              name, dns_opcode_get(&message->wire), dso->remote_name);
   2499         break;
   2500     case kDSOEventType_DNSResponse:
   2501         // We shouldn't get here because we already handled any DNS messages
   2502         message = event_context;
   2503         INFO(PRI_S_SRP ": DNS Response (opcode=%d) received from " PRI_S_SRP,
   2504              name, dns_opcode_get(&message->wire), dso->remote_name);
   2505         break;
   2506     case kDSOEventType_DSOMessage:
   2507         INFO(PRI_S_SRP ": DSO Message (Primary TLV=%d) received from " PRI_S_SRP,
   2508              name, dso->primary.opcode, dso->remote_name);
                // Any traffic from the peer counts as a sign of life: note the time and re-arm the
                // receive wakeup at twice the keepalive interval. The retain below is the reference
                // held on behalf of the wakeup (presumably dropped by srpl_connection_context_release).
   2509         srpl_connection->last_message_received = srp_time();
   2510         ioloop_add_wake_event(srpl_connection->keepalive_receive_wakeup,
   2511                               srpl_connection, srpl_keepalive_receive_wakeup, srpl_connection_context_release,
   2512                               srpl_connection->keepalive_interval * 2);
   2513         RETAIN_HERE(srpl_connection, srpl_connection); // for the callback
   2514         message = event_context;
   2515         srpl_dso_message(srpl_connection, message, dso, false);
   2516         break;
   2517     case kDSOEventType_DSOResponse:
   2518         INFO(PRI_S_SRP ": DSO Response (Primary TLV=%d) received from " PRI_S_SRP,
   2519              name, dso->primary.opcode, dso->remote_name);
                // Same sign-of-life bookkeeping as for DSO messages.
   2520         srpl_connection->last_message_received = srp_time();
   2521         ioloop_add_wake_event(srpl_connection->keepalive_receive_wakeup,
   2522                               srpl_connection, srpl_keepalive_receive_wakeup, srpl_connection_context_release,
   2523                               srpl_connection->keepalive_interval * 2);
   2524         RETAIN_HERE(srpl_connection, srpl_connection); // for the callback
                // For responses the message is reached through the query-receive context.
   2525         response_context = event_context;
   2526         message = response_context->message_context;
   2527         srpl_dso_message(srpl_connection, message, dso, true);
   2528         break;
   2529 
   2530     case kDSOEventType_Finalize:
   2531         INFO("Finalize");
   2532         break;
   2533 
   2534     case kDSOEventType_Connected:
   2535         INFO("Connected to " PRI_S_SRP, dso->remote_name);
   2536         break;
   2537 
   2538     case kDSOEventType_ConnectFailed:
   2539         INFO("Connection to " PRI_S_SRP " failed", dso->remote_name);
   2540         break;
   2541 
   2542     case kDSOEventType_Disconnected:
   2543         INFO("Connection to " PRI_S_SRP " disconnected", dso->remote_name);
   2544         break;
   2545     case kDSOEventType_ShouldReconnect:
   2546         INFO("Connection to " PRI_S_SRP " should reconnect (not for a server)", dso->remote_name);
   2547         break;
   2548     case kDSOEventType_Inactive:
   2549         INFO(PRI_S_SRP "Inactivity timer went off, closing connection.", name);
   2550         break;
   2551     case kDSOEventType_Keepalive:
   2552         INFO("should send a keepalive now.");
   2553         break;
   2554     case kDSOEventType_KeepaliveRcvd:
   2555         keepalive_context = event_context;
                // Clear send_response in the DSO context; on a server connection
                // srpl_keepalive_receive() sends the keepalive reply itself.
   2556         keepalive_context->send_response = false;
   2557         INFO(PRI_S_SRP ": keepalive received, xid %04x.", name, keepalive_context->xid);
   2558         srpl_keepalive_receive(srpl_connection, keepalive_context->keepalive_interval, keepalive_context->xid);
                // Re-arm after srpl_keepalive_receive() so that a keepalive_interval it just changed
                // is the one used for the new wakeup.
   2559         srpl_connection->last_message_received = srp_time();
   2560         ioloop_add_wake_event(srpl_connection->keepalive_receive_wakeup,
   2561                               srpl_connection, srpl_keepalive_receive_wakeup, srpl_connection_context_release,
   2562                               srpl_connection->keepalive_interval * 2);
   2563         RETAIN_HERE(srpl_connection, srpl_connection); // for the callback
   2564         break;
   2565     case kDSOEventType_RetryDelay:
   2566         disconnect_context = event_context;
   2567         INFO(PRI_S_SRP ": retry delay received, %d seconds", name, disconnect_context->reconnect_delay);
   2568         srpl_dso_retry_delay(srpl_connection, disconnect_context->reconnect_delay);
   2569         break;
   2570     }
   2571 }
   2572 
   2573 static void
   2574 srpl_datagram_callback(comm_t *comm, message_t *message, void *context)
   2575 {
   2576     srpl_connection_t *srpl_connection = context;
   2577     srpl_instance_t *instance = srpl_connection->instance;
   2578     if (instance == NULL) {
   2579         INFO("datagram on connection " PRI_S_SRP " with no instance.", comm->name);
   2580         return;
   2581     }
   2582 
   2583     // If this is a DSO message, see if we have a session yet.
   2584     switch(dns_opcode_get(&message->wire)) {
   2585     case dns_opcode_dso:
   2586         if (srpl_connection->dso == NULL) {
   2587             INFO("dso message received with no DSO object on instance " PRI_S_SRP, instance->instance_name);
   2588             srpl_disconnect(srpl_connection);
   2589             return;
   2590         }
   2591         dso_message_received(srpl_connection->dso, (uint8_t *)&message->wire, message->length, message);
   2592         return;
   2593         break;
   2594     }
   2595     INFO("datagram on connection " PRI_S_SRP " not handled, type = %d.",
   2596          comm->name, dns_opcode_get(&message->wire));
   2597 }
   2598 
        // Trampoline that calls dso_cleanup(false). srpl_connection_dso_life_cycle_callback() schedules
        // it with ioloop_run_async() after a DSO state has been cancelled. `context` is unused.
   2599 static void
   2600 srpl_connection_dso_cleanup(void *UNUSED context)
   2601 {
   2602     dso_cleanup(false);
   2603 }
   2604 
   2605 // Call this to break the current srpl_connection without sending the state machine into idle.
   2606 static void
   2607 srpl_trigger_disconnect(srpl_connection_t *srpl_connection)
   2608 {
   2609     // Trigger a disconnect
   2610     if (srpl_connection->dso != NULL) {
   2611         dso_state_cancel(srpl_connection->dso);
   2612         srpl_connection->dso = NULL;
   2613     } else {
   2614         ioloop_comm_cancel(srpl_connection->connection);
   2615         ioloop_comm_release(srpl_connection->connection);
   2616         gettimeofday(&srpl_connection->connection_null_time, NULL);
   2617         srpl_connection->connection_null_reason = "trigger_disconnect";
   2618         srpl_connection->connection = NULL;
   2619     }
   2620 }
   2621 
   2622 static bool
   2623 srpl_connection_dso_life_cycle_callback(dso_life_cycle_t cycle, void *const context, dso_state_t *const dso)
   2624 {
   2625     if (cycle == dso_life_cycle_cancel) {
   2626         srpl_connection_t *srpl_connection = context;
   2627         INFO(PRI_S_SRP ": %p %p", srpl_connection->name, srpl_connection, dso);
   2628         if (srpl_connection->connection != NULL) {
   2629             ioloop_comm_cancel(srpl_connection->connection);
   2630             srpl_connection->connection->dso = NULL;
   2631             ioloop_comm_release(srpl_connection->connection);
   2632             gettimeofday(&srpl_connection->connection_null_time, NULL);
   2633             srpl_connection->connection_null_reason = "dso_life_cycle_callback";
   2634             srpl_connection->connection = NULL;
   2635         }
   2636         srpl_connection_reset(srpl_connection);
   2637         srpl_connection->dso = NULL;
   2638         RELEASE_HERE(srpl_connection, srpl_connection);
   2639         ioloop_run_async(srpl_connection_dso_cleanup, NULL);
   2640         return true;
   2641     }
   2642     return false;
   2643 }
   2644 
   2645 static void
   2646 srpl_associate_incoming_with_instance(comm_t *connection, message_t *message,
   2647                                       dso_state_t *dso, srpl_instance_t *instance)
   2648 {
   2649     srpl_connection_t *old_connection = NULL;
   2650 
   2651     srpl_connection_t *srpl_connection = srpl_connection_create(instance, false);
   2652     if (srpl_connection == NULL) {
   2653         ioloop_comm_cancel(connection);
   2654         return;
   2655     }
   2656 
   2657 
   2658     srpl_connection->connection = connection;
   2659     ioloop_comm_retain(srpl_connection->connection);
   2660 
   2661     srpl_connection->dso = dso;
   2662     srpl_connection->instance = instance;
   2663     srpl_connection->connected_address = connection->address;
   2664     srpl_connection->state = srpl_state_session_message_wait;
   2665 
   2666     dso_set_event_context(dso, srpl_connection);
   2667     RETAIN_HERE(srpl_connection, srpl_connection); // dso holds reference.
   2668     dso_set_event_callback(dso, srpl_instance_dso_event_callback);
   2669     dso_set_life_cycle_callback(dso, srpl_connection_dso_life_cycle_callback);
   2670 
   2671     connection->datagram_callback = srpl_datagram_callback;
   2672     connection->disconnected = srpl_disconnected_callback;
   2673     ioloop_comm_context_set(connection, srpl_connection, srpl_connection_context_release);
   2674     RETAIN_HERE(srpl_connection, srpl_connection); // the connection has a reference.
   2675 
   2676     srpl_connection_next_state(srpl_connection, srpl_state_session_message_wait);
   2677     srpl_instance_dso_event_callback(srpl_connection, message, dso, kDSOEventType_DSOMessage);
   2678 
   2679     // We drop it after we set it up because that lets us send a retry_delay to the peer.
   2680     if (instance->domain == NULL || instance->domain->srpl_opstate != SRPL_OPSTATE_ROUTINE) {
   2681         INFO(PRI_S_SRP ": dropping peer reconnect because we aren't in routine state.", instance->instance_name);
   2682         RELEASE_HERE(instance, srpl_instance);
   2683         srpl_connection->instance = NULL;
   2684         srpl_connection_discontinue(srpl_connection);
   2685         RELEASE_HERE(srpl_connection, srpl_connection);
   2686         return;
   2687     }
   2688 
   2689     // If we already have a connection with the remote partner, we replace it with the new connection.
   2690     if (instance->connection != NULL) {
   2691         INFO(PRI_S_SRP ": we already have a connection (%p).", instance->instance_name, old_connection);
   2692         old_connection = instance->connection;
   2693         RELEASE_HERE(old_connection->instance, srpl_instance);
   2694         old_connection->instance = NULL;
   2695         srpl_connection_discontinue(old_connection);
   2696         RELEASE_HERE(old_connection, srpl_connection);
   2697     }
   2698     instance->connection = srpl_connection; // Retained via create/copy rule.
   2699 }
   2700 
   2701 static void
   2702 srpl_add_unidentified_server(comm_t *connection, message_t *message, dso_state_t *dso, srp_server_t *server_state)
   2703 {
   2704     srpl_instance_t **inp, *unmatched_instance = NULL;
   2705 
   2706     const char *instance_name;
   2707     char nbuf[kDNSServiceMaxDomainName];
   2708     // Take ip address as the instance name
   2709     if (connection->address.sa.sa_family == AF_INET6) {
   2710         instance_name = inet_ntop(AF_INET6, &connection->address.sin6.sin6_addr, nbuf, sizeof nbuf);
   2711     } else {
   2712         instance_name = inet_ntop(AF_INET, &connection->address.sin.sin_addr, nbuf, sizeof nbuf);
   2713     }
   2714 
   2715     // Check if an unmatched instance has been created for the same address
   2716     for (inp = &server_state->unmatched_instances; *inp != NULL; inp = &(*inp)->next) {
   2717         if (!strcmp((*inp)->instance_name, instance_name)) {
   2718             INFO("we already have an unmatched instance " PRI_S_SRP, instance_name);
   2719             unmatched_instance = *inp;
   2720             break;
   2721         }
   2722     }
   2723     if (unmatched_instance == NULL) {
   2724         INFO("create a temporary instance " PRI_S_SRP, instance_name);
   2725         unmatched_instance = calloc(1, sizeof(*unmatched_instance));
   2726         if (unmatched_instance == NULL) {
   2727             ERROR("no memory for unmatched instance");
   2728             return;
   2729         }
   2730         RETAIN_HERE(unmatched_instance, srpl_instance); // The unmatched instance list will hold this reference.
   2731         // Create a dummy domain because domain can not be decided at this point
   2732         srpl_domain_t *domain = calloc(1, sizeof(*domain));
   2733         if (domain == NULL) {
   2734             ERROR("no memory for domain structure");
   2735             RELEASE_HERE(unmatched_instance, srpl_instance);
   2736             return;
   2737         }
   2738         RETAIN_HERE(domain, srpl_domain);
   2739         unmatched_instance->domain = domain;
   2740         domain->server_state = server_state;
   2741         unmatched_instance->unmatched = true;
   2742 
   2743         unmatched_instance->instance_name = strdup(instance_name);
   2744         if (unmatched_instance->instance_name == NULL) {
   2745             ERROR("no memory for unmatched instance" PRI_S_SRP, instance_name);
   2746             RELEASE_HERE(unmatched_instance, srpl_instance);
   2747             return;
   2748         }
   2749         // Find the end of the list and append the newly created instance.
   2750         for (inp = &server_state->unmatched_instances; *inp != NULL; inp = &(*inp)->next) {
   2751         }
   2752         *inp = unmatched_instance;
   2753     }
   2754     srpl_associate_incoming_with_instance(connection, message, dso, unmatched_instance);
   2755 }
   2756 
   2757 static void
   2758 srpl_match_unidentified_with_instance(srpl_connection_t *connection,
   2759                                       srpl_instance_t *instance)
   2760 {
   2761     srpl_instance_t *cur = connection->instance;
   2762 
   2763     // Take the connection from its instance.
   2764     RETAIN_HERE(connection, srpl_connection); // for the function
   2765     RELEASE_HERE(cur->connection, srpl_connection);
   2766     cur->connection = NULL;
   2767     RELEASE_HERE(connection->instance, srpl_instance); // Get rid of the instance's reference to the connection
   2768     connection->instance = NULL;
   2769 
   2770     // Remove the currently associated instance from the unmatched_instances
   2771     srpl_instance_t **sp = NULL;
   2772 
   2773     INFO("matched temporary instance " PRI_S_SRP " to instance " PRI_S_SRP " with partner_id %" PRIx64,
   2774          cur->instance_name, instance->instance_name, instance->partner_id);
   2775     instance->version_mismatch = cur->version_mismatch;
   2776 #define POINTER_TO_HEX_MAX_STRLEN 19 // 0x<...>
   2777     size_t srpl_connection_name_length = strlen(instance->instance_name) + 2 + POINTER_TO_HEX_MAX_STRLEN + 3;
   2778     char *new_name = malloc(srpl_connection_name_length);
   2779     if (new_name != NULL) {
   2780         free(connection->name);
   2781         connection->name = new_name;
   2782         // unidentified connection is by definition an incoming connection.
   2783         snprintf(new_name, srpl_connection_name_length, "<%s (%p)", instance->instance_name, connection);
   2784     }
   2785     srp_server_t *server_state = cur->domain->server_state;
   2786     for (sp = &server_state->unmatched_instances; *sp; sp = &(*sp)->next) {
   2787         if (*sp == cur) {
   2788             *sp = cur->next;
   2789             break;
   2790         }
   2791     }
   2792 
   2793     // Release the unmatched instance list's reference to the unmatched instance. We already released the srpl_connection's
   2794     // reference, so this should dispose of the unmatched instance.
   2795     RELEASE_HERE(cur, srpl_instance);
   2796 
   2797     if (instance->domain == NULL || instance->domain->srpl_opstate != SRPL_OPSTATE_ROUTINE) {
   2798         INFO(PRI_S_SRP "dropping peer reconnect because we aren't in routine state.", instance->domain->name);
   2799         srpl_disconnect(connection);
   2800         goto out;
   2801     }
   2802 
   2803     if (connection->state == srpl_state_idle ||
   2804         connection->state == srpl_state_disconnect ||
   2805         connection->state == srpl_state_disconnect_wait)
   2806     {
   2807         INFO("connection " PRI_S_SRP " is in " PUB_S_SRP, connection->name, srpl_state_name(connection->state));
   2808         goto out;
   2809     }
   2810 
   2811     if (connection->database_synchronized) {
   2812         srpl_state_transition_by_dataset_id(instance->domain, instance);
   2813     }
   2814 
   2815     // Release any older connection we might have.
   2816     if (instance->connection) {
   2817         srpl_connection_t *old_connection = instance->connection;
   2818         if (old_connection->instance != NULL) {
   2819             RELEASE_HERE(old_connection->instance, srpl_instance);
   2820             old_connection->instance = NULL;
   2821         }
   2822         srpl_disconnect(instance->connection);
   2823         RELEASE_HERE(instance->connection, srpl_connection); // Instance still holds reference.
   2824         instance->connection = NULL;
   2825         INFO("release connection on instance " PRI_S_SRP " with partner_id %" PRIx64, instance->instance_name, instance->partner_id);
   2826     }
   2827     instance->connection = connection;
   2828     RETAIN_HERE(instance->connection, srpl_connection);
   2829     connection->instance = instance;
   2830     RETAIN_HERE(connection->instance, srpl_instance); // Retain on the connection
   2831 out:
   2832     RELEASE_HERE(connection, srpl_connection); // done with using it for this function.
   2833 }
   2834 
   2835 void
   2836 srpl_dso_server_message(comm_t *connection, message_t *message, dso_state_t *dso, srp_server_t *server_state)
   2837 {
   2838     srpl_domain_t *domain;
   2839     srpl_instance_t *instance;
   2840     srpl_instance_service_t *service;
   2841     address_query_t *address;
   2842     int i;
   2843 
   2844     // Figure out from which instance this connection originated
   2845     for (domain = server_state->srpl_domains; domain != NULL; domain = domain->next) {
   2846         for (instance = domain->instances; instance != NULL; instance = instance->next) {
   2847             for (service = instance->services; service != NULL; service = service->next) {
   2848                 address = service->address_query;
   2849                 if (address == NULL) {
   2850                     continue;
   2851                 }
   2852                 for (i = 0; i < address->num_addresses; i++) {
   2853                     if (ip_addresses_equal(&connection->address, &address->addresses[i])) {
   2854                         INFO("SRP Replication connection received from " PRI_S_SRP " on " PRI_S_SRP,
   2855                              address->hostname, connection->name);
   2856                         srpl_associate_incoming_with_instance(connection, message, dso, instance);
   2857                         return;
   2858                     }
   2859                 }
   2860             }
   2861         }
   2862     }
   2863 
   2864     INFO("incoming SRP Replication server connection from unrecognized server " PRI_S_SRP, connection->name);
   2865     srpl_add_unidentified_server(connection, message, dso, server_state);
   2866 }
   2867 
   2868 static void
   2869 srpl_connected(comm_t *connection, void *context)
   2870 {
   2871     srpl_connection_t *srpl_connection = context;
   2872 
   2873     INFO(PRI_S_SRP " connected", connection->name);
   2874     connection->dso = dso_state_create(false, 2, connection->name, srpl_instance_dso_event_callback,
   2875                                        srpl_connection, srpl_connection_dso_life_cycle_callback, connection);
   2876     if (connection->dso == NULL) {
   2877         ERROR(PRI_S_SRP " can't create dso state object.", srpl_connection->name);
   2878         srpl_disconnect(srpl_connection);
   2879         return;
   2880     }
   2881     RETAIN_HERE(srpl_connection, srpl_connection); // dso holds reference to connection
   2882     srpl_connection->dso = connection->dso;
   2883 
   2884     // Generate an event indicating that we've been connected
   2885     srpl_event_t event;
   2886     srpl_event_initialize(&event, srpl_event_connected);
   2887     srpl_event_deliver(srpl_connection, &event);
   2888 }
   2889 
   2890 static bool
   2891 srpl_connection_connect(srpl_connection_t *srpl_connection)
   2892 {
   2893     if (srpl_connection->instance == NULL) {
   2894         ERROR(PRI_S_SRP ": no instance to connect to", srpl_connection->name);
   2895         return false;
   2896     }
   2897     srpl_connection->connection = ioloop_connection_create(&srpl_connection->connected_address,
   2898                                                            // tls, stream, stable, opportunistic
   2899                                                              true,   true,   true, true,
   2900                                                            srpl_datagram_callback, srpl_connected,
   2901                                                            srpl_disconnected_callback, srpl_connection_context_release,
   2902                                                            srpl_connection);
   2903     if (srpl_connection->connection == NULL) {
   2904         ADDR_NAME_LOGGER(ERROR, &srpl_connection->connected_address, "can't create connection to address ",
   2905                          " for srpl connection ", " port ", srpl_connection->name,
   2906                          srpl_connection->connected_address.sa.sa_family == AF_INET ?
   2907                          srpl_connection->connected_address.sin.sin_port: srpl_connection->connected_address.sin6.sin6_port);
   2908         return false;
   2909     }
   2910     srpl_connection->is_server = false;
   2911     ADDR_NAME_LOGGER(INFO, &srpl_connection->connected_address, "connecting to address ", " for instance ", " port ",
   2912                      srpl_connection->name, srpl_connection->connected_address.sa.sa_family == AF_INET ?
   2913                      srpl_connection->connected_address.sin.sin_port: srpl_connection->connected_address.sin6.sin6_port);
   2914     RETAIN_HERE(srpl_connection, srpl_connection); // For the connection's reference
   2915     return true;
   2916 }
   2917 
   2918 static void
   2919 srpl_instance_is_me(srpl_instance_t *instance, srpl_instance_service_t *service, const char *ifname, const addr_t *address, bool pid_match)
   2920 {
   2921     instance->is_me = true;
   2922     if (ifname != NULL) {
   2923         INFO(PUB_S_SRP "/" PUB_S_SRP ": name server for service " PRI_S_SRP " is me.", service->host_name, ifname, service->full_service_name);
   2924     } else if (address != NULL) {
   2925         ADDR_NAME_LOGGER(INFO, address, "", " service ", " is me. ", service->host_name, 0);
   2926     } else if (pid_match) {
   2927         INFO(PUB_S_SRP ": partner id %" PRIx64 "match.", instance->instance_name, instance->partner_id);
   2928     } else {
   2929         ERROR("null ifname and address, partner id doesn't match!");
   2930         return;
   2931     }
   2932 
   2933     // When we create the instance, we start an outgoing connection; when we discover that this is a connection
   2934     // to me, we can discontinue that outgoing connection.
   2935     if (instance->connection && !instance->connection->is_server) {
   2936         srpl_connection_discontinue(instance->connection);
   2937     }
   2938 }
   2939 
   2940 static bool
   2941 srpl_my_address_check(srp_server_t *server_state, const addr_t *address)
   2942 {
   2943     static interface_address_state_t *ifaddrs = NULL;
   2944     interface_address_state_t *ifa;
   2945     static time_t last_fetch = 0;
   2946     // Update the interface address list every sixty seconds, but only if we're asked to check an address.
   2947     const time_t now = srp_time();
   2948     if (last_fetch == 0 || now - last_fetch > 60) {
   2949         last_fetch = now;
   2950         ioloop_map_interface_addresses_here(server_state, &ifaddrs, NULL, NULL, NULL);
   2951     }
   2952     // See if there's a match.
   2953     for (ifa = ifaddrs; ifa; ifa = ifa->next) {
   2954         if (ip_addresses_equal(address, &ifa->addr)) {
   2955             return true;
   2956         }
   2957     }
   2958     return false;
   2959 }
   2960 
// Callback for the address query on one of an instance's services.
//
// context: the srpl_instance_service_t the query belongs to.
// address: the address that was added or removed.
// added:   true if the address was added; anything else is treated as a removal.
// more:    true if more callbacks are expected; when false, the per-batch flags on the instance
//          (added_address, matched_unidentified) are acted upon and then cleared.
// err:     kDNSServiceErr_NoError on success; any other value causes the service's address query to be
//          canceled and released, and nothing else is done.
static void
srpl_instance_address_callback(void *context, addr_t *address, bool added, bool more, int err)
{
    srpl_instance_service_t *service = context;
    srpl_instance_t *instance = service->instance;
    if (err != kDNSServiceErr_NoError) {
        ERROR("service instance address resolution for " PRI_S_SRP " failed with %d", service->host_name, err);
        if (service->address_query) {
            address_query_cancel(service->address_query);
            RELEASE_HERE(service->address_query, address_query);
            service->address_query = NULL;
        }
        return;
    }

    if (added) {
        // See whether the new address belongs to a peer that connected to us before we had discovered it:
        // such connections are parked on temporary instances on the server's unmatched_instances list.
        bool matched_unidentified = false;
        srp_server_t *server_state = service->domain->server_state;
        srpl_instance_t **up = &server_state->unmatched_instances;
        while (*up != NULL) {
            srpl_instance_t *unmatched_instance = *up;
            srpl_connection_t *unidentified = unmatched_instance->connection;
            // An unmatched instance without a connection has nothing to match against.
            if (unidentified == NULL) {
                up = &(*up)->next;
                continue;
            }
            // A connection without a DSO object is reported as a fault, discontinued and dropped
            // from the unmatched instance.
            if (unidentified->dso == NULL) {
                FAULT("unidentified instance " PRI_S_SRP " (%p) has outgoing connection (%p)!",
                      unmatched_instance->instance_name, unmatched_instance, unidentified);
                srpl_connection_discontinue(unidentified);
                RELEASE_HERE(unmatched_instance->connection, srpl_connection);
                unmatched_instance->connection = NULL;
                up = &(*up)->next;
                continue;
            }
            if (ip_addresses_equal(address, &unidentified->connected_address)) {
                INFO("Unidentified connection " PRI_S_SRP " matches new address for instance " PRI_S_SRP,
                     unidentified->dso->remote_name, instance->instance_name);
                // This unlinks the unmatched instance from the list we are walking, so we stop
                // iterating right afterwards.
                srpl_match_unidentified_with_instance(unidentified, instance);
                matched_unidentified = true;
                break;
            } else {
                // No match: log both addresses to help with debugging, then move on.
                if (unidentified->connected_address.sa.sa_family == AF_INET6) {
                    SEGMENTED_IPv6_ADDR_GEN_SRP(&unidentified->connected_address.sin6.sin6_addr, rdata_buf);
                    INFO("Unidentified connection address is: " PRI_SEGMENTED_IPv6_ADDR_SRP,
                         SEGMENTED_IPv6_ADDR_PARAM_SRP(&unidentified->connected_address.sin6.sin6_addr, rdata_buf));
                } else {
                    IPv4_ADDR_GEN_SRP(&unidentified->connected_address.sin.sin_addr, rdata_buf);
                    INFO("Unidentified connection address is: " PRI_IPv4_ADDR_SRP,
                         IPv4_ADDR_PARAM_SRP(&unidentified->connected_address.sin.sin_addr, rdata_buf));
                }
                if (address->sa.sa_family == AF_INET6) {
                    SEGMENTED_IPv6_ADDR_GEN_SRP(&address->sin6.sin6_addr, rdata_buf);
                    INFO("New address is: " PRI_SEGMENTED_IPv6_ADDR_SRP,
                         SEGMENTED_IPv6_ADDR_PARAM_SRP(&address->sin6.sin6_addr, rdata_buf));
                } else {
                    IPv4_ADDR_GEN_SRP(&address->sin.sin_addr, rdata_buf);
                    INFO("New address is: " PRI_IPv4_ADDR_SRP,
                         IPv4_ADDR_PARAM_SRP(&address->sin.sin_addr, rdata_buf));
                }
                INFO("Unidentified connection addr %p does not match new address for instance addr %p",
                     unidentified, instance);
                INFO("Unidentified connection " PRI_S_SRP " does not match new address for instance " PRI_S_SRP,
                     unidentified->dso->remote_name, instance->instance_name);
                up = &(*up)->next;
            }
        }

        // If the address is one of our own, the instance is us; otherwise remember that an address was
        // added (and whether it matched an unidentified connection) until the end of the batch.
        if (srpl_my_address_check(server_state, address)) {
            srpl_instance_is_me(instance, service, NULL, address, false);
        } else {
            instance->added_address = true;
            if (matched_unidentified) {
                instance->matched_unidentified = true;
            }
        }
    } else {
        srpl_event_t event;
        srpl_event_initialize(&event, srpl_event_address_remove);

        // Generate an event indicating that an address has been removed.
        if (!instance->is_me) {
            if (instance->connection != NULL) {
                srpl_event_deliver(instance->connection, &event);
            }
        }
    }

    INFO(PRI_S_SRP " on instance " PRI_S_SRP ", matched_unidentified " PRI_S_SRP ", added_address " PRI_S_SRP,
         more ? "more coming" : "done", instance->instance_name, instance->matched_unidentified ? "yes" : "no",
         instance->added_address ? "yes" : "no");
    if (!more) {
        // End of the batch: an address-add event is only delivered when an address was added, no
        // unidentified connection was matched, and the instance has a non-server connection.
        if (instance->added_address && !instance->matched_unidentified) {
            // Generate an event indicating that we have a new address.
            if (instance->connection != NULL && !instance->connection->is_server) {
                srpl_event_t event;
                srpl_event_initialize(&event, srpl_event_address_add);
                srpl_event_deliver(instance->connection, &event);
            }
        }
        // reset added_address and matched_unidentified
        instance->added_address = false;
        instance->matched_unidentified = false;
    }
}
   3066 
   3067 static void
   3068 srpl_abandon_nonpreferred_dataset(srpl_domain_t *NONNULL domain)
   3069 {
   3070     srpl_instance_t *instance, *next;
   3071     for (instance = domain->instances; instance != NULL; instance = next) {
   3072         next = instance->next;
   3073         if (instance->have_dataset_id) {
   3074             if (srpl_dataset_id_compare(domain->dataset_id, instance->dataset_id) > 0) {
   3075                 instance->sync_to_join = false;
   3076                 if (instance->connection != NULL) {
   3077                     INFO("abandon dataset with instance " PRI_S_SRP " of partner id %" PRIx64,
   3078                          instance->instance_name, instance->partner_id);
   3079                     srpl_connection_reset(instance->connection);
   3080                     srpl_connection_next_state(instance->connection, srpl_state_disconnected);
   3081                 }
   3082             }
   3083         }
   3084     }
   3085 }
   3086 
   3087 static void srpl_transition_to_startup_state(srpl_domain_t *domain);
   3088 
   3089 static int
   3090 srpl_dataset_id_compare(uint64_t id1, uint64_t id2)
   3091 {
   3092     int64_t distance = id1 - id2;
   3093     if (distance == 0) {
   3094         return 0;
   3095     } else if (distance > 0) {
   3096         return 1;
   3097     } else if (distance == EQUI_DISTANCE64 && (int64_t)id1 > (int64_t)id2) {
   3098         // the number 2^(N1) (where N is 64) is equidistant in both directions in sequence number terms.
   3099         // they are both considered to be "less than" each other. This is true for any serial number with
   3100         // distance of 0x8000000000000000 between them. To break the tie, higher signed number wins.
   3101         return 1;
   3102     } else {
   3103         return -1;
   3104     }
   3105 }
   3106 
   3107 static bool
   3108 srpl_instances_max_dataset_id(srpl_domain_t *domain, uint64_t *dataset_id)
   3109 {
   3110     srpl_instance_t *instance = NULL;
   3111     bool have_max = false;
   3112     uint64_t max = 0;
   3113 
   3114     for (instance = domain->instances; instance != NULL; instance = instance->next) {
   3115         if (instance->sync_fail || instance->discontinuing || instance->version_mismatch) {
   3116             continue;
   3117         }
   3118         if (!have_max) {
   3119             max = instance->dataset_id;
   3120             have_max = true;
   3121         } else {
   3122             if (srpl_dataset_id_compare(instance->dataset_id, max) > 0) {
   3123                 max = instance->dataset_id;
   3124             }
   3125         }
   3126     }
   3127     if (!have_max) {
   3128         INFO("no available dataset id.");
   3129     } else {
   3130         *dataset_id = max;
   3131     }
   3132     return have_max;
   3133 }
   3134 
   3135 static void
   3136 srpl_instance_add(const char *hostname, const char *service_name,
   3137                   const char *ifname, srpl_domain_t *domain, srpl_instance_service_t *service,
   3138                   bool have_partner_id, uint64_t advertised_partner_id,
   3139                   bool have_dataset_id, uint64_t advertised_dataset_id,
   3140                   bool have_priority, uint32_t advertised_priority)
   3141 {
   3142     srpl_instance_t **sp, *instance;
   3143     srpl_instance_service_t **hp;
   3144 
   3145     // Find the service on the instance list for this domain.
   3146     for (instance = domain->instances; instance != NULL; instance = instance->next) {
   3147         for (hp = &instance->services; *hp != NULL; hp = &(*hp)->next) {
   3148             if (service == *hp) {
   3149                 INFO("service " PRI_S_SRP " is found with instance " PRI_S_SRP, service_name, instance->instance_name);
   3150                 break;
   3151             }
   3152         }
   3153         if (*hp != NULL) {
   3154             break;
   3155         }
   3156     }
   3157 
   3158     if (instance == NULL) {
   3159         INFO("service " PRI_S_SRP " for " PRI_S_SRP "/" PUB_S_SRP " " PUB_S_SRP
   3160              "id %" PRIx64 " " PUB_S_SRP "did %" PRIx64 " not on list",
   3161              service_name != NULL ? service_name : "<NULL>", hostname, ifname,
   3162              have_partner_id ? "" : "!", advertised_partner_id,
   3163              have_dataset_id ? "" : "!", advertised_dataset_id);
   3164         // Look for the instance with the same partner id
   3165         for (sp = &domain->instances; *sp != NULL; sp = &(*sp)->next) {
   3166             instance = *sp;
   3167             if (instance->have_partner_id && instance->partner_id == advertised_partner_id) {
   3168                 INFO("instance " PRI_S_SRP " has matched partner_id %" PRIx64, instance->instance_name, instance->partner_id);
   3169                 break;
   3170             }
   3171         }
   3172 
   3173         if (*sp == NULL) {
   3174             // We don't have the instance to the remote partner yet, create one
   3175             instance = calloc(1, sizeof(*instance));
   3176             if (instance == NULL) {
   3177                 ERROR("no memory to create instance for service " PRI_S_SRP, service_name);
   3178                 RELEASE_HERE(service, srpl_instance_service);
   3179                 return;
   3180             }
   3181             // Retain for the instance list on the domain
   3182             RETAIN_HERE(instance, srpl_instance);
   3183             instance->domain = domain;
   3184             RETAIN_HERE(instance->domain, srpl_domain);
   3185             instance->services = service;
   3186             *sp = instance;
   3187             INFO("create a new instance for service " PRI_S_SRP, service_name);
   3188         } else {
   3189             for (hp = &instance->services; *hp != NULL; hp = &(*hp)->next)
   3190                 ;
   3191             *hp = service;
   3192             INFO("instance " PRI_S_SRP " exists; just link the service " PRI_S_SRP, instance->instance_name, service_name);
   3193         }
   3194         // Retain service for the instance service list
   3195         RETAIN_HERE(service, srpl_instance_service);
   3196         // Retain instance for service
   3197         RETAIN_HERE(instance, srpl_instance);
   3198         service->instance = instance;
   3199     }
   3200     // take the host name of the remote partner as the instance name
   3201     char *pch = strchr(service_name, '.');
   3202     if (instance->instance_name == NULL || strncmp(instance->instance_name, service_name, pch - service_name)) {
   3203         char partner_name[kDNSServiceMaxDomainName];
   3204         memcpy(partner_name, service_name, pch - service_name);
   3205         partner_name[pch - service_name] = '\0';
   3206         char *new_partner_name = strdup(partner_name);
   3207         if (new_partner_name == NULL) {
   3208             ERROR("no memory for instance name.");
   3209             return;
   3210         } else {
   3211             INFO("instance name changed from " PRI_S_SRP " to " PRI_S_SRP, instance->instance_name ?
   3212                  instance->instance_name : "NULL", new_partner_name);
   3213             free(instance->instance_name);
   3214             instance->instance_name = new_partner_name;
   3215         }
   3216     }
   3217     bool some_id_updated = false;
   3218     if (have_dataset_id && (!instance->have_dataset_id || instance->dataset_id != advertised_dataset_id)) {
   3219         some_id_updated = true;
   3220         instance->have_dataset_id = true;
   3221         instance->dataset_id = advertised_dataset_id;
   3222         INFO("update instance " PRI_S_SRP " dataset_id %" PRIx64 " from service " PRI_S_SRP,
   3223              instance->instance_name, instance->dataset_id, service_name);
   3224     }
   3225 
   3226     // If this add changed the partner ID, we may want to re-attempt a connect.
   3227     if (have_partner_id && (!instance->have_partner_id || instance->partner_id != advertised_partner_id)) {
   3228         some_id_updated = true;
   3229         instance->have_partner_id = true;
   3230         instance->partner_id = advertised_partner_id;
   3231         INFO("instance " PRI_S_SRP " update partner_id to %" PRIx64, instance->instance_name, advertised_partner_id);
   3232     }
   3233 
   3234     // If this add changed the priority, we may want to re-attempt a connect.
   3235     if (have_priority && (!instance->have_priority || instance->priority != advertised_priority)) {
   3236         some_id_updated = true;
   3237         instance->have_priority = true;
   3238         instance->priority = advertised_priority;
   3239         INFO("instance " PRI_S_SRP " update priority to %" PRIx32, instance->instance_name, advertised_priority);
   3240     }
   3241     // To join the replication, sync with remote partners that are discovered during the
   3242     // discovery window.
   3243     if (domain->partner_discovery_pending) {
   3244          instance->discovered_in_window = true;
   3245      }
   3246 
   3247     // If the hostname changed, we need to restart the address query.
   3248     if (service->host_name == NULL || strcmp(service->host_name, hostname)) {
   3249         if (service->address_query != NULL) {
   3250             address_query_cancel(service->address_query);
   3251             RELEASE_HERE(service->address_query, address_query);
   3252             service->address_query = NULL;
   3253         }
   3254 
   3255         if (service->host_name != NULL) {
   3256             INFO("name server name change from " PRI_S_SRP " to " PRI_S_SRP " for " PRI_S_SRP "/" PUB_S_SRP " in domain " PRI_S_SRP,
   3257                  service->host_name, hostname, service_name == NULL ? "<NULL>" : service_name, ifname, domain->name);
   3258         } else {
   3259             INFO("new name server " PRI_S_SRP " for " PRI_S_SRP "/" PUB_S_SRP " in domain " PRI_S_SRP,
   3260                  hostname, service_name == NULL ? "<NULL>" : service_name, ifname, domain->name);
   3261         }
   3262 
   3263         char *new_name = strdup(hostname);
   3264         if (new_name == NULL) {
   3265             // This should never happen, and if it does there's actually no clean way to recover from it.  This approach
   3266             // will result in no crash, and since we don't start an address query in this case, we will just wind up in
   3267             // a quiescent state for this replication peer until something changes.
   3268             ERROR("no memory for service name.");
   3269             return;
   3270         } else {
   3271             free(service->host_name);
   3272             service->host_name = new_name;
   3273         }
   3274         // The instance may be connected. It's possible its IP address hasn't changed. If it has changed, we should
   3275         // get a disconnect due to a connection timeout or (if something else got the same address, a reset) if for
   3276         // no other reason, and then we'll try to reconnect, so this should be harmless.
   3277     }
   3278 
   3279     // The address query can be NULL either because we only just created the instance, or because the instance name changed (e.g.
   3280     // as the result of a hostname conflict).
   3281     if (service->address_query == NULL) {
   3282         service->address_query = address_query_create(service->host_name, service,
   3283                                                       srpl_instance_address_callback,
   3284                                                       srpl_instance_service_context_release);
   3285         if (service->address_query == NULL) {
   3286             INFO("unable to create address query");
   3287         } else {
   3288             RETAIN_HERE(service, srpl_instance_service); // retain for the address query.
   3289         }
   3290     }
   3291 
   3292     if (instance->have_partner_id &&
   3293         domain->partner_id  == instance->partner_id)
   3294     {
   3295         srpl_instance_is_me(instance, service, NULL, NULL, true);
   3296     }
   3297 
   3298     // If there's no existing connection, the partner initiates an outgoing connection if
   3299     // it is in the startup state or its partner id is greater than the remote partner id.
   3300     if (!instance->is_me && ((instance->connection == NULL || some_id_updated) &&
   3301         (domain->srpl_opstate == SRPL_OPSTATE_STARTUP ||
   3302          domain->partner_id > advertised_partner_id)))
   3303     {
   3304         char msg_buf[256];
   3305         if (domain->srpl_opstate == SRPL_OPSTATE_STARTUP) {
   3306             snprintf(msg_buf, sizeof(msg_buf), "I am in startup state");
   3307         } else {
   3308             snprintf(msg_buf, sizeof(msg_buf), "local partner_id %" PRIx64
   3309                      " greater than remote partner_id %" PRIx64, domain->partner_id,
   3310                      advertised_partner_id);
   3311         }
   3312         INFO("making outgoing connection on instance " PRI_S_SRP " (partner_id: %" PRIx64 ") since " PUB_S_SRP,
   3313              instance->instance_name, instance->partner_id, msg_buf);
   3314         if (instance->connection != NULL && instance->connection->connection != NULL) {
   3315             srpl_trigger_disconnect(instance->connection);
   3316             srpl_connection_next_state(instance->connection, srpl_state_idle);
   3317         } else {
   3318             srpl_instance_reconnect(instance);
   3319         }
   3320     } else {
   3321         INFO(PRI_S_SRP ": not making outgoing connection: " PUB_S_SRP "is_me, connection = %p, " PRI_S_SRP
   3322              "some_id_updated, local partner_id %" PRIx64 ", remote partner_id %" PRIx64,
   3323              instance->instance_name, instance->is_me ? "" : "!", instance->connection, some_id_updated ? "" : "!",
   3324              domain->partner_id, advertised_partner_id);
   3325         // it's not our job to connect, but since there's some id change, we'd disconnect
   3326         // the current connection and trigger the peer to reconnect.
   3327         if (some_id_updated && instance->connection != NULL &&
   3328             instance->connection->connection != NULL)
   3329         {
   3330             INFO("some id updated, disconnect the current connection.");
   3331             srpl_trigger_disconnect(instance->connection);
   3332             srpl_connection_next_state(instance->connection, srpl_state_idle);
   3333         }
   3334     }
   3335 }
   3336 
   3337 static void
   3338 srpl_resolve_callback(srpl_instance_service_t *service)
   3339 {
   3340     char ifname[IFNAMSIZ];
   3341     srpl_domain_t *domain = service->domain;
   3342     const char *domain_name;
   3343     uint8_t domain_len;
   3344     const char *partner_id_string;
   3345     const char *dataset_id_string;
   3346     const char *xpanid_string;
   3347     const char *priority_string;
   3348     uint8_t partner_id_string_len;
   3349     uint8_t dataset_id_string_len;
   3350     uint8_t xpanid_string_len;
   3351     uint8_t priority_string_len;
   3352     char partner_id_buf[INT64_HEX_STRING_MAX];
   3353     char dataset_id_buf[INT64_HEX_STRING_MAX];
   3354     char xpanid_buf[INT64_HEX_STRING_MAX];
   3355     char priority_buf[INT64_HEX_STRING_MAX];
   3356     uint64_t advertised_partner_id = 0;
   3357     bool have_partner_id = false;
   3358     uint64_t advertised_dataset_id = 0;
   3359     bool have_dataset_id = false;
   3360     bool have_priority = false;
   3361     uint16_t advertised_priority = 0;
   3362     srpl_instance_service_t **sp;
   3363     srp_server_t *server_state = domain->server_state;
   3364 
   3365     // These are just used to do the "satisfied" check--we can tell that we have these records from the rdata pointers.
   3366     service->have_txt_record = service->have_srv_record = false;
   3367 
   3368     // In case we later determine that the data we got is stale, this flag indicates that it's okay to try to
   3369     // reconfirm it.
   3370     service->got_new_info = true;
   3371 
   3372     if (service->txt_rdata == NULL) {
   3373         INFO(PRI_S_SRP ": service update with no TXT record--skipping", service->full_service_name);
   3374         return;
   3375     }
   3376     if (service->srv_rdata == NULL) {
   3377         INFO(PRI_S_SRP ": service update with no SRV record--skipping", service->full_service_name);
   3378         return;
   3379     }
   3380 
   3381     domain_name = TXTRecordGetValuePtr(service->txt_length, service->txt_rdata, "dn", &domain_len);
   3382     if (domain_name == NULL) {
   3383         INFO("resolve for " PRI_S_SRP " succeeded, but there is no domain name.", service->full_service_name);
   3384         return;
   3385     }
   3386 
   3387     if (domain_len != strlen(domain->name) || memcmp(domain_name, domain->name, domain_len)) {
   3388         const char *domain_print;
   3389         char *domain_terminated = malloc(domain_len + 1);
   3390         if (domain_terminated == NULL) {
   3391             domain_print = "<no memory for domain name>";
   3392         } else {
   3393             memcpy(domain_terminated, domain_name, domain_len);
   3394             domain_terminated[domain_len] = 0;
   3395             domain_print = domain_terminated;
   3396         }
   3397         INFO("domain (" PRI_S_SRP ") for " PRI_S_SRP " doesn't match expected domain " PRI_S_SRP,
   3398              domain_print, service->full_service_name, domain->name);
   3399         free(domain_terminated);
   3400         return;
   3401     }
   3402 
   3403     if (server_state->current_thread_domain_name == NULL) {
   3404         FAULT("current_thread_domain_name is NULL.");
   3405         return;
   3406     }
   3407 
   3408     if (strcmp(domain->name, server_state->current_thread_domain_name)) {
   3409         INFO("discovered srpl instance is not for current thread domain, so not setting up replication.");
   3410         return;
   3411     }
   3412 
   3413     INFO("server " PRI_S_SRP " for " PRI_S_SRP, service->full_service_name, domain->name);
   3414 
   3415     // Make sure it's for our mesh.
   3416     snprintf(xpanid_buf, sizeof(xpanid_buf), "%" PRIx64, server_state->xpanid);
   3417     xpanid_string = TXTRecordGetValuePtr(service->txt_length, service->txt_rdata, "xpanid", &xpanid_string_len);
   3418     if (xpanid_string == NULL ||
   3419         (xpanid_string_len != strlen(xpanid_buf) || memcmp(xpanid_buf, xpanid_string, xpanid_string_len)))
   3420     {
   3421         char other_xpanid_buf[INT64_HEX_STRING_MAX];
   3422         if (xpanid_string_len >= sizeof(other_xpanid_buf)) {
   3423             xpanid_string_len = sizeof(other_xpanid_buf) - 1;
   3424         }
   3425         if (xpanid_string == NULL) {
   3426             const char none[] = "(none)";
   3427             memcpy(other_xpanid_buf, none, sizeof(none));
   3428         } else {
   3429             memcpy(other_xpanid_buf, xpanid_string, xpanid_string_len);
   3430             other_xpanid_buf[xpanid_string_len] = 0;
   3431         }
   3432         INFO("discovered srpl instance is not for xpanid " PRI_S_SRP ", not " PRI_S_SRP
   3433              " so not setting up replication.", xpanid_buf, other_xpanid_buf);
   3434         return;
   3435     }
   3436 
   3437     partner_id_string = TXTRecordGetValuePtr(service->txt_length, service->txt_rdata, "pid", &partner_id_string_len);
   3438     if (partner_id_string != NULL && partner_id_string_len < INT64_HEX_STRING_MAX) {
   3439         char *endptr, *nulptr;
   3440         unsigned long long num;
   3441         memcpy(partner_id_buf, partner_id_string, partner_id_string_len);
   3442         nulptr = &partner_id_buf[partner_id_string_len];
   3443         *nulptr = '\0';
   3444         num = strtoull(partner_id_buf, &endptr, 16);
   3445         // On current architectures, unsigned long long and uint64_t are the same size, but we should have a check here
   3446         // just in case, because the standard doesn't guarantee that this will be true.
   3447         // If endptr == nulptr, that means we converted the entire buffer and didn't run into a NUL in the middle of it
   3448         // somewhere.
   3449         if (num < UINT64_MAX && endptr == nulptr) {
   3450             advertised_partner_id = num;
   3451             have_partner_id = true;
   3452         }
   3453     }
   3454 
   3455     dataset_id_string = TXTRecordGetValuePtr(service->txt_length, service->txt_rdata, "did", &dataset_id_string_len);
   3456     if (dataset_id_string != NULL && dataset_id_string_len < INT64_HEX_STRING_MAX) {
   3457         char *endptr, *nulptr;
   3458         unsigned long long num;
   3459         memcpy(dataset_id_buf, dataset_id_string, dataset_id_string_len);
   3460         nulptr = &dataset_id_buf[dataset_id_string_len];
   3461         *nulptr = '\0';
   3462         num = strtoull(dataset_id_buf, &endptr, 16);
   3463         if (num < UINT64_MAX && endptr == nulptr) {
   3464             advertised_dataset_id = num;
   3465             have_dataset_id = true;
   3466         }
   3467     }
   3468 
   3469     priority_string = TXTRecordGetValuePtr(service->txt_length, service->txt_rdata, "priority", &priority_string_len);
   3470     if (priority_string != NULL && priority_string_len < INT64_HEX_STRING_MAX) {
   3471         char *endptr, *nulptr;
   3472         unsigned long long num;
   3473         memcpy(priority_buf, priority_string, priority_string_len);
   3474         nulptr = &priority_buf[priority_string_len];
   3475         *nulptr = '\0';
   3476         num = strtoull(priority_buf, &endptr, 16);
   3477         if (num < UINT64_MAX && endptr == nulptr) {
   3478             advertised_priority = num;
   3479             have_priority = true;
   3480         }
   3481     }
   3482 
   3483     dns_rr_t srv_record;
   3484     unsigned offset = 0;
   3485     memset(&srv_record, 0, sizeof(srv_record));
   3486     srv_record.type = dns_rrtype_srv;
   3487     if (!dns_rdata_parse_data(&srv_record, service->srv_rdata, &offset, offset + service->srv_length, service->srv_length, 0)) {
   3488         ERROR(PRI_S_SRP ": unable to parse srv record", service->full_service_name);
   3489         return;
   3490     }
   3491 
   3492     service->outgoing_port = srv_record.data.srv.port;
   3493 
   3494     if (if_indextoname(service->ifindex, ifname) == NULL) {
   3495         snprintf(ifname, sizeof(ifname), "%d", service->ifindex);
   3496     }
   3497 
   3498     char namebuf[DNS_MAX_NAME_SIZE_ESCAPED + 1];
   3499     dns_name_print(srv_record.data.srv.name, namebuf, sizeof(namebuf));
   3500     dns_name_free(srv_record.data.srv.name);
   3501 
   3502     srpl_instance_add(namebuf, service->full_service_name, ifname, service->domain, service, have_partner_id,
   3503                       advertised_partner_id, have_dataset_id, advertised_dataset_id, have_priority,
   3504                       advertised_priority);
   3505 
   3506     // After the service is associated with a resolved instance, we should take it off the unresolved
   3507     // list if the service is still on it. If the service fails to assocaited with an instance because
   3508     // for example, the resolve shows a service that does not include required data, we should still
   3509     // keep the service on the unresolved list. Later on when we get an expected resolve, the service
   3510     // can be moved to the associated list. This guarantees that a service at a time has to be on either
   3511     // unresolved or associated list.
   3512     for (sp = &domain->unresolved_services; *sp; sp = &(*sp)->next) {
   3513         if (*sp == service) {
   3514             *sp = service->next;
   3515             service->next = NULL;
   3516             RELEASE_HERE(service, srpl_instance_service);
   3517             break;
   3518         }
   3519     }
   3520 }
   3521 
   3522 static void
   3523 srpl_instance_service_newdata_timeout(void *context)
   3524 {
   3525     srpl_instance_service_t *service = context;
   3526     srpl_resolve_callback(service);
   3527 }
   3528 
   3529 static void
   3530 srpl_instance_service_satisfied_check(srpl_instance_service_t *service)
   3531 {
   3532     if (service->have_srv_record && service->have_txt_record) {
   3533         if (service->resolve_wakeup != NULL) {
   3534             ioloop_cancel_wake_event(service->resolve_wakeup);
   3535         }
   3536         srpl_resolve_callback(service);
   3537         return;
   3538     }
   3539     INFO(PRI_S_SRP ": not satisfied, waiting.", service->full_service_name);
   3540     if (service->resolve_wakeup == NULL) {
   3541         service->resolve_wakeup = ioloop_wakeup_create();
   3542         if (service->resolve_wakeup == NULL) {
   3543             ERROR(PRI_S_SRP ": unable to allocate resolve wakeup", service->full_service_name);
   3544             return;
   3545         }
   3546     }
   3547     ioloop_add_wake_event(service->resolve_wakeup, service, srpl_instance_service_newdata_timeout,
   3548                           srpl_instance_service_context_release, 1000); // max one second
   3549     RETAIN_HERE(service, srpl_instance_service); // for the wakeup
   3550 }
   3551 
   3552 static void
   3553 srpl_service_txt_callback(DNSServiceRef UNUSED sdRef, DNSServiceFlags UNUSED flags, uint32_t UNUSED interfaceIndex,
   3554                           DNSServiceErrorType errorCode, const char *fullname, uint16_t UNUSED rrtype, uint16_t UNUSED rrclass,
   3555                           uint16_t rdlen, const void *rdata, uint32_t UNUSED ttl, void *context)
   3556 {
   3557     srpl_instance_service_t *service = context;
   3558     if (errorCode != kDNSServiceErr_NoError) {
   3559         ERROR("txt resolve for " PRI_S_SRP " failed with %d", fullname, errorCode);
   3560         if (service->txt_txn != NULL) {
   3561             ioloop_dnssd_txn_cancel(service->txt_txn);
   3562             ioloop_dnssd_txn_release(service->txt_txn);
   3563             service->txt_txn = NULL;
   3564         }
   3565         return;
   3566     }
   3567 
   3568     free(service->txt_rdata);
   3569     if (!(flags & kDNSServiceFlagsAdd)) {
   3570         INFO("TXT record for " PRI_S_SRP " went away", service->full_service_name);
   3571         service->txt_rdata = NULL;
   3572         service->txt_length = 0;
   3573         service->have_txt_record = false;
   3574         return;
   3575     }
   3576     service->txt_rdata = malloc(rdlen);
   3577     if (service->txt_rdata == NULL) {
   3578         ERROR("unable to save txt rdata for " PRI_S_SRP, service->full_service_name);
   3579         return;
   3580     }
   3581     memcpy(service->txt_rdata, rdata, rdlen);
   3582     service->txt_length = rdlen;
   3583     service->have_txt_record = true;
   3584     srpl_instance_service_satisfied_check(service);
   3585 }
   3586 
   3587 static void
   3588 srpl_service_srv_callback(DNSServiceRef UNUSED sdRef, DNSServiceFlags flags, uint32_t UNUSED interfaceIndex,
   3589                           DNSServiceErrorType errorCode, const char *UNUSED fullname, uint16_t UNUSED rrtype, uint16_t UNUSED rrclass,
   3590                           uint16_t rdlen, const void *rdata, uint32_t UNUSED ttl, void *context)
   3591 {
   3592     srpl_instance_service_t *service = context;
   3593     if (errorCode != kDNSServiceErr_NoError) {
   3594         ERROR("srv resolve for " PRI_S_SRP " failed with %d", fullname, errorCode);
   3595         if (service->srv_txn != NULL) {
   3596             ioloop_dnssd_txn_cancel(service->srv_txn);
   3597             ioloop_dnssd_txn_release(service->srv_txn);
   3598             service->srv_txn = NULL;
   3599         }
   3600         return;
   3601     }
   3602 
   3603     free(service->srv_rdata);
   3604     if (!(flags & kDNSServiceFlagsAdd)) {
   3605         INFO("SRV record for " PRI_S_SRP " went away", service->full_service_name);
   3606         service->srv_rdata = NULL;
   3607         service->srv_length = 0;
   3608         service->have_srv_record = false;
   3609         return;
   3610     }
   3611     service->srv_rdata = malloc(rdlen);
   3612     if (service->srv_rdata == NULL) {
   3613         ERROR("unable to save srv rdata for " PRI_S_SRP, service->full_service_name);
   3614         return;
   3615     }
   3616     memcpy(service->srv_rdata, rdata, rdlen);
   3617     service->srv_length = rdlen;
   3618     service->have_srv_record = true;
   3619     srpl_instance_service_satisfied_check(service);
   3620 }
   3621 
   3622 static void
   3623 srpl_browse_restart(void *context)
   3624 {
   3625     srpl_domain_t *domain = context;
   3626     ERROR("restarting browse on domain " PRI_S_SRP, domain->name);
   3627     srpl_domain_browse_start(domain);
   3628 }
   3629 
   3630 static bool
   3631 srpl_service_instance_query_start(srpl_instance_service_t *service, dnssd_txn_t **txn, const char *rrtype_name,
   3632                                   uint16_t rrtype, uint16_t qclass, DNSServiceQueryRecordReply callback)
   3633 {
   3634     DNSServiceRef sdref;
   3635 
   3636     int err = DNSServiceQueryRecord(&sdref, kDNSServiceFlagsLongLivedQuery, kDNSServiceInterfaceIndexAny,
   3637                                     service->full_service_name,  rrtype, qclass, callback, service);
   3638     if (err != kDNSServiceErr_NoError) {
   3639         ERROR("unable to resolve " PUB_S_SRP " record for " PRI_S_SRP ": code %d",
   3640               rrtype_name, service->full_service_name, err);
   3641         return false;
   3642     }
   3643     *txn = ioloop_dnssd_txn_add(sdref, service, srpl_instance_service_context_release, NULL);
   3644     if (*txn == NULL) {
   3645         ERROR("unable to allocate dnssd_txn_t for " PUB_S_SRP " record for " PRI_S_SRP,
   3646               rrtype_name, service->full_service_name);
   3647         DNSServiceRefDeallocate(sdref);
   3648         return false;
   3649     }
   3650     // Retain for the dnssd_txn.
   3651     RETAIN_HERE(service, srpl_instance_service);
   3652     return true;
   3653 }
   3654 
   3655 static void
   3656 srpl_domain_context_release(void *context)
   3657 {
   3658     srpl_domain_t *domain = context;
   3659     RELEASE_HERE(domain, srpl_domain);
   3660 }
   3661 
// DNSServiceBrowse callback for the _srpl-tls._tcp browse started by srpl_domain_browse_start(); <context> is
// the srpl_domain_t being browsed.
//
// On error: the browse transaction is torn down, every instance and every unresolved service on the domain is
// discontinued, and a wakeup is scheduled to restart the browse via srpl_browse_restart().
//
// On an add event: the service is looked up by full name, first on domain->unresolved_services and then on
// each instance's service list. A service that isn't known yet is allocated and appended to the unresolved
// list; a known one has its copy count bumped and any discontinue in progress is canceled. Unless a resolve is
// already running for the service, TXT and SRV queries are started for it.
//
// On a remove event: the copy count of a known service is decremented, and the service is discontinued once
// the count reaches zero. Remove events for unknown services are ignored.
static void
srpl_browse_callback(DNSServiceRef UNUSED sdRef, DNSServiceFlags flags, uint32_t interfaceIndex,
                     DNSServiceErrorType errorCode, const char *serviceName, const char *regtype,
                     const char *replyDomain, void *context)
{
    srpl_domain_t *domain = context;
    if (errorCode != kDNSServiceErr_NoError) {
        ERROR("browse on domain " PRI_S_SRP " failed with %d", domain->name, errorCode);
        if (domain->query != NULL) {
            ioloop_dnssd_txn_cancel(domain->query);
            ioloop_dnssd_txn_release(domain->query);
            domain->query = NULL;
        }

        // Get rid of all instances on the domain, because we aren't going to get remove events for them.
        // If we start a new browse and get add events while the connections are still up, this will
        // have no effect.
        srpl_instance_t *next_instance;
        for (srpl_instance_t *instance = domain->instances; instance; instance = next_instance) {
            INFO("_srpl-tls._tcp instance " PRI_S_SRP " went away.", instance->instance_name);
            // Fetch the successor before discontinuing the current instance.
            next_instance = instance->next;
            srpl_instance_discontinue(instance);
        }

        srpl_instance_service_t *service, *next;
        for (service = domain->unresolved_services; service != NULL; service = next) {
            INFO("discontinue unresolved service " PRI_S_SRP, service->full_service_name);
            // Fetch the successor before discontinuing the current service.
            next = service->next;
            service->num_copies = 0;
            srpl_instance_service_discontinue(service);
        }

        // The wakeup object lives on the server state and is created the first time it is needed. If it can't
        // be created, no restart is scheduled.
        if (domain->server_state->srpl_browse_wakeup == NULL) {
            domain->server_state->srpl_browse_wakeup = ioloop_wakeup_create();
        }
        if (domain->server_state->srpl_browse_wakeup != NULL) {
            ioloop_add_wake_event(domain->server_state->srpl_browse_wakeup,
                                  domain, srpl_browse_restart, srpl_domain_context_release, 1000);
            // Retain the domain on behalf of the wakeup; srpl_domain_context_release is its release callback.
            RETAIN_HERE(domain, srpl_domain);
        }
        return;
    }

    // Services are identified by their full name (instance name + service type + domain).
    char full_service_name[kDNSServiceMaxDomainName];
    DNSServiceConstructFullName(full_service_name, serviceName, regtype, replyDomain);

    srpl_instance_t *instance;
    srpl_instance_service_t *service;
    // See if we already have a service record going; First search in unresolved_services list which
    // contains the services that haven't been resolved yet.
    for (service = domain->unresolved_services; service; service = service->next) {
        if (!strcmp(service->full_service_name, full_service_name)) {
            break;
        }
    }
    // If the service is not found in unresolved_services list, search in instance list which contains services
    // that have been resolved.
    if (service == NULL) {
        for (instance = domain->instances; instance; instance = instance->next) {
            for (service = instance->services; service; service = service->next) {
                if (!strcmp(service->full_service_name, full_service_name)) {
                   break;
                }
            }
            // A non-NULL service here means the inner loop found a match, so stop searching.
            if (service != NULL) {
                break;
            }
        }
    }
    if (flags & kDNSServiceFlagsAdd) {
        if (service != NULL) {
            // it's possible that a service goes away and starts discontinuing, and before the timeout,
            // the service comes back again. In this case, since the service is still on the list, it
            // appears as a duplicate add. But we should cancel the discontinue timer.
            if (service->discontinue_timeout != NULL) {
                if (service->discontinuing) {
                    INFO("discontinue on service " PRI_S_SRP " canceled.", service->full_service_name);
                    ioloop_cancel_wake_event(service->discontinue_timeout);
                    service->discontinuing = false;
                    if (service->instance != NULL) {
                        service->instance->discontinuing = false;
                    }
                }
            }

            // A resolve is already running for this service, so only the copy count needs updating.
            if (service->resolve_started) {
                INFO(PRI_S_SRP ": resolve_started true, incrementing num_copies to %d",
                     full_service_name, service->num_copies + 1);
                service->num_copies++;
                INFO("duplicate add for service " PRI_S_SRP, full_service_name);
                return;
             }
             // In this case the service went away and came back, so service->resolve_started is false, but the
             // instance still exists.
             INFO(PRI_S_SRP ": resolve_started false, incrementing num_copies to %d",
                  full_service_name, service->num_copies + 1);
             service->num_copies++;
             INFO("service " PRI_S_SRP " went away but came back.", full_service_name);
        } else {
            service = calloc(1, sizeof(*service));
            if (service == NULL) {
                ERROR("no memory for service " PRI_S_SRP, full_service_name);
                return;
            }
            // Retain for unresolved_services list
            RETAIN_HERE(service, srpl_instance_service);
            service->domain = domain;
            RETAIN_HERE(service->domain, srpl_domain);

            service->full_service_name = strdup(full_service_name);
            if (service->full_service_name == NULL) {
                ERROR("no memory for service name " PRI_S_SRP, full_service_name);
                // The service hasn't been linked onto the list yet, so this drops the reference taken above.
                RELEASE_HERE(service, srpl_instance_service);
                return;
            }
            INFO(PRI_S_SRP ": new service, setting num_copies to 1", full_service_name);
            service->num_copies = 1;
            service->ifindex = interfaceIndex;
            // add to the unresolved service list
            srpl_instance_service_t **sp;
            for (sp = &domain->unresolved_services; *sp != NULL; sp = &(*sp)->next) {;}
            *sp = service;

            // Convert the full service name to wire format in a stack buffer and keep a heap copy of it as
            // the service's PTR rdata.
            dns_towire_state_t towire;
            uint8_t name_buffer[kDNSServiceMaxDomainName];
            memset(&towire, 0, sizeof(towire));
            towire.p = name_buffer;
            towire.lim = towire.p + kDNSServiceMaxDomainName;
            dns_full_name_to_wire(NULL, &towire, full_service_name);

            free(service->ptr_rdata);
            service->ptr_length = towire.p - name_buffer;
            service->ptr_rdata = malloc(service->ptr_length);
            if (service->ptr_rdata == NULL) {
                ERROR("unable to save PTR rdata for " PRI_S_SRP, full_service_name);
                // NOTE(review): the service stays on the unresolved list here with no queries started.
                return;
            }
            memcpy(service->ptr_rdata, name_buffer, service->ptr_length);
        }

        // Start both record queries; resolve_started is only set if both could be started.
        if (!srpl_service_instance_query_start(service, &service->txt_txn, "TXT", dns_rrtype_txt, dns_qclass_in,
                                               srpl_service_txt_callback) ||
            !srpl_service_instance_query_start(service, &service->srv_txn, "SRV", dns_rrtype_srv, dns_qclass_in,
                                               srpl_service_srv_callback))
        {
            return;
        }
        INFO("resolving " PRI_S_SRP, full_service_name);
        service->resolve_started = true;
    } else {
        if (service != NULL) {
            INFO(PRI_S_SRP ": decrementing num_copies to %d", full_service_name, service->num_copies - 1);
            service->num_copies--;
            // Clamp at zero so that an unexpected extra remove event can't leave the count negative.
            if (service->num_copies < 0) {
                FAULT("num_copies went negative");
                service->num_copies = 0;
            }
            if (service->num_copies == 0) {
                INFO("discontinuing service " PRI_S_SRP, full_service_name);
                srpl_instance_service_discontinue(service);
                return;
            }
        }
    }
}
   3827 
   3828 static void
   3829 srpl_dnssd_txn_fail(void *context, int err)
   3830 {
   3831     srpl_domain_t *domain = context;
   3832     ERROR("service browse " PRI_S_SRP " i/o failure: %d", domain->name, err);
   3833 }
   3834 
   3835 static bool
   3836 srpl_domain_browse_start(srpl_domain_t *domain)
   3837 {
   3838     int ret;
   3839     DNSServiceRef sdref;
   3840 
   3841     INFO("starting browse on _srpl-tls._tcp");
   3842     // Look for an NS record for the specified domain using mDNS, not DNS.
   3843     ret = DNSServiceBrowse(&sdref, kDNSServiceFlagsLongLivedQuery,
   3844                            kDNSServiceInterfaceIndexAny, "_srpl-tls._tcp", NULL, srpl_browse_callback, domain);
   3845     if (ret != kDNSServiceErr_NoError) {
   3846         ERROR("Unable to query for NS records for " PRI_S_SRP, domain->name);
   3847         return false;
   3848     }
   3849     domain->query = ioloop_dnssd_txn_add(sdref, domain, srpl_domain_context_release, srpl_dnssd_txn_fail);
   3850     if (domain->query == NULL) {
   3851         ERROR("Unable to set up ioloop transaction for NS query on " PRI_S_SRP, domain->name);
   3852         DNSServiceRefDeallocate(sdref);
   3853         return false;
   3854     }
   3855     RETAIN_HERE(domain, srpl_domain); // For the browse
   3856     return true;
   3857 }
   3858 
   3859 srpl_domain_t *
   3860 srpl_domain_create_or_copy(srp_server_t *server_state, const char *domain_name)
   3861 {
   3862     srpl_domain_t **dp, *domain;
   3863 
   3864     // Find the domain, if it's already there.
   3865     for (dp = &server_state->srpl_domains; *dp; dp = &(*dp)->next) {
   3866         domain = *dp;
   3867         if (!strcasecmp(domain->name, domain_name)) {
   3868             break;
   3869         }
   3870     }
   3871 
   3872     // If not there, make it.
   3873     if (*dp == NULL) {
   3874         domain = calloc(1, sizeof(*domain));
   3875         if (domain == NULL || (domain->name = strdup(domain_name)) == NULL) {
   3876             ERROR("Unable to allocate replication structure for domain " PRI_S_SRP, domain_name);
   3877             free(domain);
   3878             return NULL;
   3879         }
   3880         *dp = domain;
   3881         // Hold a reference for the domain list
   3882         RETAIN_HERE(domain, srpl_domain);
   3883         INFO("New service replication browsing domain: " PRI_S_SRP, domain->name);
   3884 
   3885         domain->srpl_opstate = SRPL_OPSTATE_STARTUP;
   3886         domain->server_state = server_state;
   3887         domain->partner_id = srp_random64();
   3888         INFO("generate partner id %" PRIx64 " for domain " PRI_S_SRP, domain->partner_id, domain->name);
   3889     } else {
   3890         ERROR("Unexpected duplicate replication domain: " PRI_S_SRP, domain_name);
   3891         return NULL;
   3892     }
   3893     return domain;
   3894 }
   3895 
   3896 static void
   3897 srpl_domain_add(srp_server_t *server_state, const char *domain_name)
   3898 {
   3899     srpl_domain_t *domain = srpl_domain_create_or_copy(server_state, domain_name);
   3900     if (domain == NULL) {
   3901         return;
   3902     }
   3903 
   3904     domain->partner_discovery_timeout = ioloop_wakeup_create();
   3905     if (domain->partner_discovery_timeout) {
   3906         ioloop_add_wake_event(domain->partner_discovery_timeout, domain,
   3907                               srpl_partner_discovery_timeout, srpl_domain_context_release,
   3908                               MIN_PARTNER_DISCOVERY_INTERVAL + srp_random16() % PARTNER_DISCOVERY_INTERVAL_RANGE);
   3909         RETAIN_HERE(domain, srpl_domain);
   3910     } else {
   3911         ERROR("unable to add wakeup event for partner discovery for domain " PRI_S_SRP, domain->name);
   3912         return;
   3913     }
   3914     domain->partner_discovery_pending = true;
   3915 
   3916     // Start a browse on the domain.
   3917     if (!srpl_domain_browse_start(domain)) {
   3918         return;
   3919     }
   3920 }
   3921 
   3922 static void
   3923 srpl_domain_rename(const char *current_name, const char *new_name)
   3924 {
   3925     ERROR("replication domain " PRI_S_SRP " renamed to " PRI_S_SRP ", not currently handled.", current_name, new_name);
   3926 }
   3927 
   3928 // Note that when this is implemented, it has the potential to return new thread domain names more than once, so
   3929 // in principle we need to change the name of the domain we are advertising.
   3930 static cti_status_t
   3931 cti_get_thread_network_name(void *context, cti_string_property_reply_t NONNULL callback,
   3932                             run_context_t NULLABLE UNUSED client_queue)
   3933 {
   3934     callback(context, "openthread", kCTIStatus_NoError);
   3935     return kCTIStatus_NoError;
   3936 }
   3937 
   3938 //
   3939 // Event apply functions, print functions, and state actions, generally in order
   3940 //
   3941 
   3942 static bool
   3943 event_is_message(srpl_event_t *event)
   3944 {
   3945     switch(event->event_type) {
   3946     case srpl_event_invalid:
   3947     case srpl_event_address_add:
   3948     case srpl_event_address_remove:
   3949     case srpl_event_server_disconnect:
   3950     case srpl_event_reconnect_timer_expiry:
   3951     case srpl_event_disconnected:
   3952     case srpl_event_connected:
   3953     case srpl_event_advertise_finished:
   3954     case srpl_event_srp_client_update_finished:
   3955     case srpl_event_do_sync:
   3956         return false;
   3957 
   3958     case srpl_event_session_response_received:
   3959     case srpl_event_send_candidates_response_received:
   3960     case srpl_event_candidate_received:
   3961     case srpl_event_host_message_received:
   3962     case srpl_event_candidate_response_received:
   3963     case srpl_event_host_response_received:
   3964     case srpl_event_session_message_received:
   3965     case srpl_event_send_candidates_message_received:
   3966         return true;
   3967     }
   3968     return false;
   3969 }
   3970 
// States that require an instance (most states). We also validate the chain up to the server state, because
// it's possible for that to go away and yet still for one last event to arrive, at least in principle.
//
// Expands to a statement that logs an error and returns srpl_state_invalid from the enclosing function
// when any of connection->instance, ->domain or ->server_state is NULL. The argument is evaluated more
// than once, so it must be free of side effects.
#define REQUIRE_SRPL_INSTANCE(srpl_connection)                                                              \
    do {                                                                                                    \
        if ((srpl_connection)->instance == NULL || (srpl_connection)->instance->domain == NULL ||           \
            (srpl_connection)->instance->domain->server_state == NULL) {                                    \
            ERROR(PRI_S_SRP ": no instance in state " PUB_S_SRP, (srpl_connection)->name,                   \
                  (srpl_connection)->state_name);                                                           \
            return srpl_state_invalid;                                                                      \
        }                                                                                                   \
    } while(false)
   3982 
// For states that never receive events: if an event was delivered anyway, log it and return
// srpl_state_invalid from the enclosing function. Arguments are evaluated more than once, so they
// must be free of side effects.
#define REQUIRE_SRPL_EVENT_NULL(srpl_connection, event)                                      \
    do {                                                                                     \
        if ((event) != NULL) {                                                               \
            ERROR(PRI_S_SRP ": received unexpected " PUB_S_SRP " event in state " PUB_S_SRP, \
                  (srpl_connection)->name, (event)->name, (srpl_connection)->state_name);    \
            return srpl_state_invalid;                                                       \
        }                                                                                    \
    } while (false)
   3992 
// Announce that we have entered a state that takes no events. The argument is evaluated twice.
#define STATE_ANNOUNCE_NO_EVENTS(srpl_connection)                                                              \
    do {                                                                                                       \
        INFO(PRI_S_SRP ": entering state " PUB_S_SRP, (srpl_connection)->name, (srpl_connection)->state_name); \
    } while (false)
   3998 
// Announce that we have entered a state that takes no events, and include the host name the state is
// working on. fqdn is rendered with dns_name_print() into a local buffer named "hostname", so the
// fqdn argument expression must not itself refer to a caller variable of that name.
#define STATE_ANNOUNCE_NO_EVENTS_NAME(connection, fqdn)                                                    \
    do {                                                                                                   \
        char hostname[kDNSServiceMaxDomainName];                                                           \
        dns_name_print((fqdn), hostname, sizeof(hostname));                                                \
        INFO(PRI_S_SRP ": entering state " PUB_S_SRP " with host " PRI_S_SRP,                              \
            (connection)->name, (connection)->state_name, hostname);                                       \
    } while (false)
   4007 
// Announce activity in a state that can take events: when event is non-NULL, log that the event was
// received in the current state; when event is NULL, log that the state is being entered.
// Arguments are evaluated more than once, so they must be free of side effects.
#define STATE_ANNOUNCE(srpl_connection, event)                                                       \
    do {                                                                                             \
        if ((event) != NULL)  {                                                                      \
            INFO(PRI_S_SRP ": event " PUB_S_SRP " received in state " PUB_S_SRP,                     \
                 (srpl_connection)->name, (event)->name, (srpl_connection)->state_name);             \
        } else {                                                                                     \
            INFO(PRI_S_SRP ": entering state " PUB_S_SRP,                                            \
                 (srpl_connection)->name, (srpl_connection)->state_name);                            \
        }                                                                                            \
    } while (false)
   4019 
// Common body for the unexpected-event macros. Always returns from the enclosing function:
//   - if the event is a protocol message (event_is_message()), logs "invalid event" and returns `bad`;
//   - otherwise logs "unexpected event" and returns srpl_state_invalid.
// `bad` is an expression that is only evaluated in the message case. event must be non-NULL.
#define UNEXPECTED_EVENT_MAIN(srpl_connection, event, bad)                               \
    do {                                                                                 \
        if (event_is_message(event)) {                                                   \
            INFO(PRI_S_SRP ": invalid event " PUB_S_SRP " in state " PUB_S_SRP,          \
                 (srpl_connection)->name, (event)->name, (srpl_connection)->state_name); \
            return (bad);                                                                \
        }                                                                                \
        INFO(PRI_S_SRP ": unexpected event " PUB_S_SRP " in state " PUB_S_SRP,           \
             (srpl_connection)->name, (event)->name,                                     \
             (srpl_connection)->state_name);                                             \
        return srpl_state_invalid;                                                       \
    } while (false)
   4032 
// UNEXPECTED_EVENT flags the response as bad on a protocol level, triggering a retry delay
// UNEXPECTED_EVENT_NO_ERROR doesn't.
//
// NOTE(review): as written, UNEXPECTED_EVENT returns srpl_state_invalid for message events, while
// UNEXPECTED_EVENT_NO_ERROR returns whatever srpl_connection_drop_state() picks, and that function is
// the one that can schedule a reconnect and select the retry_delay_send state. This looks like the
// reverse of the description above; confirm which behavior is intended before relying on either.
#define UNEXPECTED_EVENT(srpl_connection, event) UNEXPECTED_EVENT_MAIN(srpl_connection, event, srpl_state_invalid)
#define UNEXPECTED_EVENT_NO_ERROR(srpl_connection, event) \
    UNEXPECTED_EVENT_MAIN(srpl_connection, event,         \
                          srpl_connection_drop_state((srpl_connection)->instance, (srpl_connection)))
   4038 
   4039 static void
   4040 srpl_instance_reconnect(srpl_instance_t *instance)
   4041 {
   4042     srpl_event_t event;
   4043 
   4044     // If we have a new connection, no need to reconnect.
   4045     if (instance->connection != NULL && instance->connection->is_server &&
   4046         SRPL_CONNECTION_IS_CONNECTED(instance->connection))
   4047     {
   4048         INFO(PRI_S_SRP ": we have a valid connection.", instance->instance_name);
   4049         return;
   4050     }
   4051     // We shouldn't have an outgoing connection.
   4052     if (instance->connection != NULL && !instance->connection->is_server &&
   4053         SRPL_CONNECTION_IS_CONNECTED(instance->connection))
   4054     {
   4055         FAULT(PRI_S_SRP ": got to srpl_instance_reconnect with a connected (" PUB_S_SRP ") outgoing connection.",
   4056               instance->instance_name, srpl_state_name(instance->connection->state));
   4057         return;
   4058     }
   4059 
   4060     // Start from the beginning of the address list.
   4061     srpl_instance_address_query_reset(instance);
   4062 
   4063     // If we don't have an srpl_connection at this point, make one.
   4064     if (instance->connection == NULL) {
   4065         INFO(PRI_S_SRP ": instance has no connection.", instance->instance_name);
   4066         instance->connection = srpl_connection_create(instance, true);
   4067         if (instance->connection == NULL) {
   4068             ERROR(PRI_S_SRP ": unable to create srpl_connection", instance->instance_name);
   4069             return;
   4070         }
   4071         srpl_connection_next_state(instance->connection, srpl_state_idle);
   4072     }
   4073 
   4074     // Trigger a reconnect if appropriate
   4075     if (!instance->is_me && instance->domain != NULL &&
   4076         (instance->domain->srpl_opstate == SRPL_OPSTATE_STARTUP || instance->domain->partner_id > instance->partner_id))
   4077     {
   4078         // We might be in some disconnected state other than idle, so first move to idle if that's the case.
   4079         if (instance->connection->state != srpl_state_idle) {
   4080             srpl_connection_next_state(instance->connection, srpl_state_idle);
   4081         }
   4082         srpl_event_initialize(&event, srpl_event_reconnect_timer_expiry);
   4083         srpl_event_deliver(instance->connection, &event);
   4084     } else {
   4085         ERROR(PRI_S_SRP ": reconnect requested but not appropriate: is_me = " PUB_S_SRP
   4086               ", domain = %p, opstate = %d, ddid %" PRIx64 ", idid %" PRIx64,
   4087               instance->instance_name, instance->is_me ? "true" : "false", instance->domain,
   4088               instance->domain == NULL ? -1 : instance->domain->srpl_opstate,
   4089               instance->domain == NULL ? 0 : instance->domain->partner_id, instance->partner_id);
   4090     }
   4091 }
   4092 
   4093 static void
   4094 srpl_instance_reconnect_callback(void *context)
   4095 {
   4096     srpl_instance_reconnect(context);
   4097 }
   4098 
   4099 static srpl_state_t
   4100 srpl_connection_drop_state_delay(srpl_instance_t *instance, srpl_connection_t *srpl_connection, int delay)
   4101 {
   4102     // Schedule a reconnect.
   4103     if (instance->reconnect_timeout == NULL) {
   4104         instance->reconnect_timeout = ioloop_wakeup_create();
   4105     }
   4106     if (instance->reconnect_timeout == NULL) {
   4107         FAULT(PRI_S_SRP "disconnecting, but can't reconnect!", srpl_connection->name);
   4108     } else {
   4109         RETAIN_HERE(instance, srpl_instance); // for the timeout
   4110         ioloop_add_wake_event(instance->reconnect_timeout, instance,
   4111                               srpl_instance_reconnect_callback, srpl_instance_context_release, delay * MSEC_PER_SEC);
   4112     }
   4113 
   4114     if (srpl_connection == instance->connection && srpl_connection->is_server) {
   4115         srpl_connection->retry_delay = delay;
   4116         return srpl_state_retry_delay_send;
   4117     } else {
   4118         return srpl_state_disconnect;
   4119     }
   4120 }
   4121 
   4122 static srpl_state_t
   4123 srpl_connection_drop_state(srpl_instance_t *instance, srpl_connection_t *srpl_connection)
   4124 {
   4125     if (instance == NULL) {
   4126         return srpl_state_disconnect;
   4127     } else if (instance->unmatched) {
   4128         if (instance->connection == srpl_connection) {
   4129             RELEASE_HERE(instance->connection, srpl_connection);
   4130             instance->connection = NULL;
   4131         }
   4132         return srpl_state_disconnect;
   4133     } else {
   4134         return srpl_connection_drop_state_delay(instance, srpl_connection, 300);
   4135     }
   4136 }
   4137 
   4138 // Call when there's a protocol error, so that we don't start reconnecting over and over.
   4139 static void
   4140 srpl_disconnect(srpl_connection_t *srpl_connection)
   4141 {
   4142     const int delay = 300; // five minutes
   4143     srpl_instance_t *instance = srpl_connection->instance;
   4144     if (instance != NULL && srpl_connection->connection != NULL) {
   4145         srpl_state_t state = srpl_connection_drop_state_delay(instance, srpl_connection, delay);
   4146         if (state == srpl_state_retry_delay_send) {
   4147             srpl_retry_delay_send(srpl_connection, delay);
   4148         }
   4149     }
   4150     srpl_connection_discontinue(srpl_connection);
   4151 }
   4152 
   4153 static void
   4154 srpl_connection_state_timeout(void *context)
   4155 {
   4156     srpl_connection_t *srpl_connection = context;
   4157     srpl_instance_t *instance = srpl_connection->instance;
   4158     srpl_domain_t *domain = srpl_connection_domain(srpl_connection);
   4159 
   4160     // Connection might have been discontinued before we came back.
   4161     if (instance == NULL) {
   4162         return;
   4163     }
   4164 
   4165     // If the srpl connection has been in the current state for timeout and not received any
   4166     // event to get out of the current state, we assume the peer is gone or unavailable and
   4167     // can exclude this instance when we make a decision to enter the routine state.
   4168     instance->sync_fail = true;
   4169     INFO("connection for instance " PRI_S_SRP " timed out in state " PUB_S_SRP,
   4170          instance->instance_name, srpl_connection->state_name);
   4171     if (instance != NULL && instance->sync_to_join &&
   4172         domain != NULL && domain->srpl_opstate != SRPL_OPSTATE_ROUTINE)
   4173     {
   4174         instance->sync_to_join = false;
   4175         srpl_maybe_sync_or_transition(domain);
   4176     }
   4177 }
   4178 
   4179 static void
   4180 srpl_connection_schedule_state_timeout(srpl_connection_t *srpl_connection, uint32_t when)
   4181 {
   4182     // Create a state timer on the srpl_connection_t
   4183     if (srpl_connection->state_timeout == NULL) {
   4184         srpl_connection->state_timeout = ioloop_wakeup_create();
   4185         if (srpl_connection->state_timeout == NULL) {
   4186             ERROR("no memory for state_timeout for service instance " PRI_S_SRP, srpl_connection->name);
   4187             return;
   4188         }
   4189     } else {
   4190         ioloop_cancel_wake_event(srpl_connection->state_timeout);
   4191     }
   4192     ioloop_add_wake_event(srpl_connection->state_timeout, srpl_connection, srpl_connection_state_timeout,
   4193                           srpl_connection_context_release, when);
   4194     RETAIN_HERE(srpl_connection, srpl_connection); // the timer has a reference.
   4195     return;
   4196 }
   4197 
   4198 // We arrive at the disconnected state when there is no connection to make, or no need to make a connection.
   4199 // This state takes no action, but waits for events. If we get an add event and we don't have a viable incoming
   4200 // connection, we go to the next_address_get event.
   4201 static srpl_state_t
   4202 srpl_disconnected_action(srpl_connection_t *srpl_connection, srpl_event_t *event)
   4203 {
   4204     STATE_ANNOUNCE(srpl_connection, event);
   4205 
   4206     if (event == NULL) {
   4207         srpl_connection_schedule_state_timeout(srpl_connection, SRPL_STATE_TIMEOUT);
   4208         return srpl_state_invalid;
   4209     } else if (event->event_type == srpl_event_address_add) {
   4210         ioloop_cancel_wake_event(srpl_connection->state_timeout);
   4211         return srpl_state_next_address_get;
   4212     } else {
   4213         UNEXPECTED_EVENT(srpl_connection, event);
   4214     }
   4215 }
   4216 
   4217 static void
   4218 srpl_instance_address_query_reset(srpl_instance_t *instance)
   4219 {
   4220     for (srpl_instance_service_t *service = instance->services; service != NULL; service = service->next) {
   4221         address_query_t *address_query = service->address_query;
   4222         if (address_query != NULL && address_query->num_addresses > 0) {
   4223             address_query->cur_address = -1;
   4224         }
   4225     }
   4226 }
   4227 
   4228 // This state takes the action of looking for an address to try. This can have three outcomes:
   4229 //
   4230 // * No addresses available: go to the disconnected state
   4231 // * End of address list: go to the reconnect_wait state
   4232 // * Address found: got to the connect state
   4233 
   4234 static srpl_state_t
   4235 srpl_next_address_get_action(srpl_connection_t *srpl_connection, srpl_event_t *event)
   4236 {
   4237     address_query_t *address_query = NULL;
   4238     srpl_instance_t *instance;
   4239     srpl_instance_service_t *service;
   4240     bool no_address = true;
   4241     STATE_ANNOUNCE_NO_EVENTS(srpl_connection);
   4242     REQUIRE_SRPL_INSTANCE(srpl_connection);
   4243     REQUIRE_SRPL_EVENT_NULL(srpl_connection, event);
   4244 
   4245     instance = srpl_connection->instance;
   4246     // Get the next address
   4247     // Return an event, one of "next address", "end of address list" or "no addresses"
   4248     for (service = instance->services; service != NULL; service = service->next) {
   4249         address_query = service->address_query;
   4250         if (address_query == NULL || address_query->num_addresses == 0) {
   4251             continue;
   4252         } else {
   4253             no_address = false;
   4254             if (address_query->cur_address == address_query->num_addresses ||
   4255                 ++address_query->cur_address == address_query->num_addresses)
   4256             {
   4257                 continue;
   4258             } else {
   4259                 memcpy(&srpl_connection->connected_address,
   4260                        &address_query->addresses[address_query->cur_address], sizeof(addr_t));
   4261                 if (srpl_connection->connected_address.sa.sa_family == AF_INET) {
   4262                     srpl_connection->connected_address.sin.sin_port = htons(service->outgoing_port);
   4263                 } else {
   4264                     srpl_connection->connected_address.sin6.sin6_port = htons(service->outgoing_port);
   4265                 }
   4266                 return srpl_state_connect;
   4267             }
   4268         }
   4269      }
   4270 
   4271      if (no_address) {
   4272          return srpl_state_disconnected;
   4273      } else {
   4274          srpl_instance_address_query_reset(instance);
   4275          return srpl_state_reconnect_wait;
   4276      }
   4277 }
   4278 
   4279 // This state takes the action of connecting to the connection's current address, which is expected to have
   4280 // been set. This can have two outcomes:
   4281 //
   4282 // * The connect attempt fails immediately: go to the next_address_get state
   4283 // * The connection attempt is in progress: go to the connecting state
   4284 static srpl_state_t
   4285 srpl_connect_action(srpl_connection_t *srpl_connection, srpl_event_t *event)
   4286 {
   4287     STATE_ANNOUNCE_NO_EVENTS(srpl_connection);
   4288     REQUIRE_SRPL_EVENT_NULL(srpl_connection, event);
   4289     REQUIRE_SRPL_INSTANCE(srpl_connection);
   4290 
   4291     // Connect to the address from the event.
   4292     if (!srpl_connection_connect(srpl_connection)) {
   4293         return srpl_state_next_address_get;
   4294     } else {
   4295         return srpl_state_connecting;
   4296     }
   4297 }
   4298 
   4299 // We reach this state when we are disconnected and don't need to reconnect because we have an active server
   4300 // connection. If we get a server disconnect here, then we go to the next_address_get state.
   4301 static srpl_state_t
   4302 srpl_idle_action(srpl_connection_t *srpl_connection, srpl_event_t *event)
   4303 {
   4304     REQUIRE_SRPL_INSTANCE(srpl_connection);
   4305     STATE_ANNOUNCE_NO_EVENTS(srpl_connection);
   4306     if (event == NULL) {
   4307         srpl_connection_schedule_state_timeout(srpl_connection, SRPL_STATE_TIMEOUT);
   4308         return srpl_state_invalid;
   4309     } else if (event->event_type == srpl_event_server_disconnect ||
   4310                event->event_type == srpl_event_reconnect_timer_expiry)
   4311     {
   4312         ioloop_cancel_wake_event(srpl_connection->state_timeout);
   4313         INFO(PRI_S_SRP ": event " PUB_S_SRP " received in state " PUB_S_SRP,
   4314              srpl_connection->name, event->name, srpl_connection->state_name);
   4315         return srpl_state_next_address_get;
   4316     } else {
   4317         // We don't log unhandled events in the idle state because it creates a lot of noise.
   4318         return srpl_state_invalid;
   4319     }
   4320 }
   4321 
   4322 static void
   4323 srpl_maybe_propose_new_dataset_id(srpl_domain_t *domain)
   4324 {
   4325     if (domain->have_dataset_id) {
   4326         for(srpl_instance_t *instance = domain->instances; instance != NULL; instance = instance->next)
   4327         {
   4328             // as long as there's one instance of the proposed dataset id that has not failed
   4329             // to sync, we are going to wait
   4330             if (!instance->sync_fail &&
   4331                 instance->dataset_id == domain->dataset_id)
   4332             {
   4333                 INFO("instance " PRI_S_SRP " has matched dataset_id %" PRIx64,
   4334                      instance->instance_name, instance->dataset_id);
   4335                 return;
   4336             }
   4337         }
   4338     }
   4339     domain->have_dataset_id = srpl_instances_max_dataset_id(domain, &domain->dataset_id);
   4340     INFO(PRI_S_SRP "propose a new dataset id %" PRIx64, domain->have_dataset_id? "": "fail to ", domain->dataset_id);
   4341 }
   4342 
   4343 // We've received a timeout event on the reconnect timer. Generate a reconnect_timeout event and send it to the
   4344 // connection.
   4345 static void
   4346 srpl_connection_reconnect_timeout(void *context)
   4347 {
   4348     srpl_connection_t *srpl_connection = context;
   4349     srpl_instance_t *instance = srpl_connection->instance;
   4350     srpl_domain_t *domain = srpl_connection_domain(srpl_connection);
   4351 
   4352     INFO("reconnect timeout on " PRI_S_SRP, srpl_connection->name);
   4353     srpl_event_t event;
   4354     srpl_event_initialize(&event, srpl_event_reconnect_timer_expiry);
   4355     srpl_event_deliver(srpl_connection, &event);
   4356     // If we have tried to connect to all the addresses but failed, we assume the peer is
   4357     // gone. We no longer need to synchronize with this peer and if this was an obstacle
   4358     // to enter the routine state, we should recheck again.
   4359     instance->sync_fail = true;
   4360     INFO("fail to sync with instance " PRI_S_SRP, instance->instance_name);
   4361     if (instance != NULL && instance->sync_to_join &&
   4362         domain != NULL && domain->srpl_opstate != SRPL_OPSTATE_ROUTINE)
   4363     {
   4364         instance->sync_to_join = false;
   4365         srpl_maybe_sync_or_transition(domain);
   4366     }
   4367 }
   4368 
   4369 static srpl_state_t
   4370 srpl_connection_schedule_reconnect_event(srpl_connection_t *srpl_connection, uint32_t when)
   4371 {
   4372     // Create a reconnect timer on the srpl_connection_t
   4373     if (srpl_connection->reconnect_wakeup == NULL) {
   4374         srpl_connection->reconnect_wakeup = ioloop_wakeup_create();
   4375         if (srpl_connection->reconnect_wakeup == NULL) {
   4376             ERROR("no memory for reconnect_wakeup for service instance " PRI_S_SRP, srpl_connection->name);
   4377             return srpl_state_invalid;
   4378         }
   4379     } else {
   4380         ioloop_cancel_wake_event(srpl_connection->reconnect_wakeup);
   4381     }
   4382     ioloop_add_wake_event(srpl_connection->reconnect_wakeup, srpl_connection, srpl_connection_reconnect_timeout,
   4383                           srpl_connection_context_release, when);
   4384     RETAIN_HERE(srpl_connection, srpl_connection); // the timer has a reference.
   4385     return srpl_state_invalid;
   4386 }
   4387 
   4388 // We reach the set_reconnect_timer state when we have tried to connect to all the known addresses.  Once we have set a
   4389 // timer, we wait for events. If we get a reconnect_timeout event, we go to the next_address_get state. If we get an
   4390 // add_adress event, we cancel the retransmit timer and go to the next_address_get state.
   4391 static srpl_state_t
   4392 srpl_reconnect_wait_action(srpl_connection_t *srpl_connection, srpl_event_t *event)
   4393 {
   4394     REQUIRE_SRPL_INSTANCE(srpl_connection);
   4395     STATE_ANNOUNCE(srpl_connection, event);
   4396     srpl_domain_t *domain = srpl_connection_domain(srpl_connection);
   4397 
   4398     if (event == NULL) {
   4399         return srpl_connection_schedule_reconnect_event(srpl_connection, 60 * 1000);
   4400     }
   4401     if (event->event_type == srpl_event_reconnect_timer_expiry) {
   4402         // if it's not our job to reconnect, we move into idle.
   4403         if (domain->srpl_opstate != SRPL_OPSTATE_STARTUP &&
   4404             domain->partner_id < srpl_connection->instance->partner_id)
   4405         {
   4406             return srpl_state_idle;
   4407         }
   4408         return srpl_state_next_address_get;
   4409     } else if (event->event_type == srpl_event_address_add) {
   4410         ioloop_cancel_wake_event(srpl_connection->reconnect_wakeup);
   4411         // if it's not our job to reconnect, we move into idle.
   4412         if (domain->srpl_opstate != SRPL_OPSTATE_STARTUP &&
   4413             domain->partner_id < srpl_connection->instance->partner_id)
   4414         {
   4415             return srpl_state_idle;
   4416         }
   4417         return srpl_state_next_address_get;
   4418     }
   4419     UNEXPECTED_EVENT(srpl_connection, event);
   4420 }
   4421 
   4422 // We get to this state when the remote end has sent something bogus; in this case we send a retry_delay message to
   4423 // tell the client not to reconnect for a while.
   4424 static srpl_state_t
   4425 srpl_retry_delay_send_action(srpl_connection_t *srpl_connection, srpl_event_t *event)
   4426 {
   4427     REQUIRE_SRPL_EVENT_NULL(srpl_connection, event);
   4428     STATE_ANNOUNCE_NO_EVENTS(srpl_connection);
   4429 
   4430     srpl_retry_delay_send(srpl_connection, srpl_connection->retry_delay);
   4431     return srpl_state_disconnect;
   4432 }
   4433 
   4434 // We go to the disconnect state when the connection needs to be dropped either because we lost the session ID
   4435 // coin toss or something's gone wrong. In either case, we do not attempt to reconnect--we either go to the idle state
   4436 // or the disconnect_wait state, depending on whether or not the connection has already been closed.
   4437 static srpl_state_t
   4438 srpl_disconnect_action(srpl_connection_t *srpl_connection, srpl_event_t *event)
   4439 {
   4440     REQUIRE_SRPL_EVENT_NULL(srpl_connection, event);
   4441     STATE_ANNOUNCE_NO_EVENTS(srpl_connection);
   4442 
   4443     // Any ongoing state needs to be discarded.
   4444     srpl_connection_reset(srpl_connection);
   4445 
   4446     // Disconnect the srpl_connection_t
   4447     if (srpl_connection->connection == NULL) {
   4448         return srpl_state_idle;
   4449     }
   4450     srpl_trigger_disconnect(srpl_connection);
   4451     return srpl_state_disconnect_wait;
   4452 }
   4453 
   4454 // We enter disconnect_wait when we are waiting for a disconnect event after cancelling a connection.
   4455 // There is no action for this event. The only event we are interested in is the disconnect event.
   4456 static srpl_state_t
   4457 srpl_disconnect_wait_action(srpl_connection_t *srpl_connection, srpl_event_t *event)
   4458 {
   4459     STATE_ANNOUNCE(srpl_connection, event);
   4460     if (event == NULL) {
   4461         return srpl_state_invalid;
   4462     } else if (event->event_type == srpl_event_disconnected) {
   4463         return srpl_state_idle;
   4464     } else {
   4465         UNEXPECTED_EVENT_NO_ERROR(srpl_connection, event);
   4466     }
   4467     return srpl_state_invalid;
   4468 }
   4469 
   4470 // We enter the connecting state when we've attempted a connection to some address.
   4471 // This state has no action. If a connected event is received, we move to the connected state.
   4472 // If a disconnected event is received, we move to the next_address_get state.
   4473 static srpl_state_t
   4474 srpl_connecting_action(srpl_connection_t *srpl_connection, srpl_event_t *event)
   4475 {
   4476     STATE_ANNOUNCE(srpl_connection, event);
   4477     if (event == NULL) {
   4478         srpl_connection_schedule_state_timeout(srpl_connection, SRPL_STATE_TIMEOUT);
   4479         return srpl_state_invalid;
   4480     } else if (event->event_type == srpl_event_disconnected) {
   4481         ioloop_cancel_wake_event(srpl_connection->state_timeout);
   4482         // We tried to connect and the connection failed. This may mean that the information we see in the _srpl-tls.tcp
   4483         // advertisement is wrong, or that the address records are wrong. Reconfirm the records.
   4484         srpl_reconfirm(srpl_connection);
   4485         return srpl_state_next_address_get;
   4486     } else if (event->event_type == srpl_event_connected) {
   4487         ioloop_cancel_wake_event(srpl_connection->state_timeout);
   4488         return srpl_state_session_send;
   4489     } else {
   4490         UNEXPECTED_EVENT_NO_ERROR(srpl_connection, event);
   4491     }
   4492     return srpl_state_invalid;
   4493 }
   4494 
   4495 static void
   4496 srpl_sync_wait_check(void *context)
   4497 {
   4498     srpl_connection_t *srpl_connection = context;
   4499 
   4500     if (srpl_connection->instance == NULL) {
   4501         FAULT("srpl_connection->instance shouldn't ever be NULL here, but it is.");
   4502         return;
   4503     }
   4504     if (srpl_connection->instance->domain == NULL) {
   4505         FAULT("srpl_connection->instance->domain shouldn't ever be NULL here, but it is.");
   4506         return;
   4507     }
   4508     if (!srpl_connection->instance->domain->partner_discovery_pending) {
   4509         srpl_maybe_sync_or_transition(srpl_connection->instance->domain);
   4510     }
   4511 }
   4512 
   4513 static srpl_state_t
   4514 srpl_sync_wait_action(srpl_connection_t *srpl_connection, srpl_event_t *event)
   4515 {
   4516     STATE_ANNOUNCE(srpl_connection, event);
   4517     if (event == NULL) {
   4518         // This will trigger a do_sync event if we should synchronize at this point.
   4519         ioloop_run_async(srpl_sync_wait_check, srpl_connection);
   4520         return srpl_state_invalid;
   4521     } else if (event->event_type == srpl_event_do_sync) {
   4522         // When starting to sync, we reset the keepalive_interval so that we can detect
   4523         // the problem sooner during synchronization.
   4524         srpl_connection->keepalive_interval = DEFAULT_KEEPALIVE_WAKEUP_EXPIRY / 2;
   4525         return srpl_state_send_candidates_send;
   4526     } else {
   4527         UNEXPECTED_EVENT_NO_ERROR(srpl_connection, event);
   4528     }
   4529     return srpl_state_invalid;
   4530 }
   4531 
   4532 // This state sends a SRPL session message and then goes to session_response_wait, unless the send failed, in which
   4533 // case it goes to the disconnect state.
   4534 static srpl_state_t
   4535 srpl_session_send_action(srpl_connection_t *srpl_connection, srpl_event_t *event)
   4536 {
   4537     REQUIRE_SRPL_EVENT_NULL(srpl_connection, event);
   4538     REQUIRE_SRPL_INSTANCE(srpl_connection);
   4539     STATE_ANNOUNCE_NO_EVENTS(srpl_connection);
   4540 
   4541     // Send a session message
   4542     // Now we say hello.
   4543     if (!srpl_session_message_send(srpl_connection, false)) {
   4544         return srpl_state_disconnect;
   4545     }
   4546     return srpl_state_session_response_wait;
   4547 }
   4548 
   4549 // This state waits for a session response with the remote partner ID and whether
   4550 // the remote partner is in startup state.
   4551 // When the response arrives, it goes to the send_candidates_send state.
   4552 static srpl_state_t
   4553 srpl_session_response_wait_action(srpl_connection_t *srpl_connection, srpl_event_t *event)
   4554 {
   4555     STATE_ANNOUNCE(srpl_connection, event);
   4556     if (event == NULL) {
   4557         return srpl_state_invalid;
   4558     } else if (event->event_type == srpl_event_session_response_received) {
   4559         // we are connecting and have sent the peer a session message. if the received session
   4560         // response shows that the peer has a lower version number, we don't have to sync with
   4561         // this peer to join the replication. we also note version_mismatch on the corresponding
   4562         // instance and move to the idle state. when we move to the routine state later, we'll
   4563         // pick a dataset id so that the sequence number of our anycast service would win over
   4564         // the low-version peers.
   4565         if (event->content.session.remote_version < SRPL_CURRENT_VERSION) { // version mismatch
   4566             INFO("instance " PRI_S_SRP " has a lower version number", srpl_connection->instance->instance_name);
   4567             srpl_connection->instance->version_mismatch = true;
   4568             srpl_connection->instance->sync_to_join = false;
   4569             return srpl_state_idle;
   4570         }
   4571 
   4572         srpl_connection->remote_partner_id = event->content.session.partner_id;
   4573         srpl_connection->new_partner = event->content.session.new_partner;
   4574         srpl_domain_t *domain = srpl_connection_domain(srpl_connection);
   4575         // if we are already in the routine state, we can directly move forward
   4576         // with sync; otherwise we put the srpl connection in the sync_wait state
   4577         // where we check the number of active srp servers to decide if we should
   4578         // continue sync at this point.
   4579         if (domain->srpl_opstate == SRPL_OPSTATE_ROUTINE) {
   4580             return srpl_state_send_candidates_send;
   4581         }
   4582         return srpl_state_sync_wait;
   4583     } else {
   4584         UNEXPECTED_EVENT(srpl_connection, event);
   4585     }
   4586     return srpl_state_invalid;
   4587 }
   4588 
   4589 // When evaluating the incoming session, we've decided to continue (called by srpl_session_evaluate_action).
   4590 static srpl_state_t
   4591 srpl_evaluate_incoming_continue(srpl_connection_t *srpl_connection)
   4592 {
   4593     srpl_domain_t *domain = srpl_connection_domain(srpl_connection);
   4594 
   4595     if (srpl_connection->new_partner) {
   4596         INFO(PRI_S_SRP " connecting partner is in startup state", srpl_connection->name);
   4597     } else {
   4598         INFO(PRI_S_SRP ": my partner id %" PRIx64 " < connecting partner id %" PRIx64,
   4599              srpl_connection->name, domain->partner_id, srpl_connection->remote_partner_id);
   4600     }
   4601     if (srpl_connection->is_server) {
   4602         return srpl_state_session_response_send;
   4603     } else {
   4604         return srpl_state_send_candidates_send;
   4605     }
   4606 }
   4607 
   4608 // When evaluating the incoming ID, we've decided to disconnect (called by srpl_session_evaluate_action).
   4609 static srpl_state_t
   4610 srpl_evaluate_incoming_disconnect(srpl_connection_t *srpl_connection, bool bad)
   4611 {
   4612     srpl_domain_t *domain = srpl_connection_domain(srpl_connection);
   4613 
   4614     if (domain->srpl_opstate != SRPL_OPSTATE_ROUTINE) {
   4615         INFO(PRI_S_SRP ": not in the routine state yet in domain " PRI_S_SRP,
   4616              srpl_connection->name, domain->name);
   4617     } else {
   4618         INFO(PRI_S_SRP ": my partner id %" PRIx64 " > connectiong partner id %" PRIx64,
   4619              srpl_connection->name, domain->partner_id, srpl_connection->remote_partner_id);
   4620     }
   4621     if (srpl_connection->instance->is_me) {
   4622         return srpl_evaluate_incoming_continue(srpl_connection);
   4623     } else {
   4624         if (bad) {
   4625             // bad is set if the server send back the same ID we sent, which means it's misbehaving.
   4626             return srpl_connection_drop_state(srpl_connection->instance, srpl_connection);
   4627         }
   4628         return srpl_state_disconnect;
   4629     }
   4630 }
   4631 
   4632 // This state's action is to evaluate if the partner should accept the connection.
   4633 // The receiving partner accepts the connection if the connecting partner is in the
   4634 // "startup" state (flaged as new partner), or the receiving partner's ID is smaller
   4635 // than the connecting partner's ID. Otherwise, the receiving partner disconnects.
   4636 static srpl_state_t
   4637 srpl_session_evaluate_action(srpl_connection_t *srpl_connection, srpl_event_t *event)
   4638 {
   4639     REQUIRE_SRPL_EVENT_NULL(srpl_connection, event);
   4640     REQUIRE_SRPL_INSTANCE(srpl_connection);
   4641     STATE_ANNOUNCE_NO_EVENTS(srpl_connection);
   4642     srpl_domain_t *domain = srpl_connection_domain(srpl_connection);
   4643 
   4644     // The receiving partner must be in routine state to accept the connection.
   4645     // Recceiving connection in startup state should not happen, but add a guard
   4646     // here anyway to protect against such situation.
   4647     if (domain->srpl_opstate == SRPL_OPSTATE_ROUTINE && (srpl_connection->new_partner ||
   4648         domain->partner_id < srpl_connection->remote_partner_id))
   4649     {
   4650         return srpl_evaluate_incoming_continue(srpl_connection);
   4651     } else {
   4652         return srpl_evaluate_incoming_disconnect(srpl_connection, false);
   4653     }
   4654 }
   4655 
// State action for send_candidates_send: send the "send candidates" message, and then go to the
// send_candidates_wait state.
//
// Expects a NULL event and a connection that has an instance (see the REQUIRE_* macros). Nothing that
// srpl_send_candidates_message_send may report back is examined here, so the transition is unconditional.
static srpl_state_t
srpl_send_candidates_send_action(srpl_connection_t *srpl_connection, srpl_event_t *event)
{
    REQUIRE_SRPL_EVENT_NULL(srpl_connection, event);
    REQUIRE_SRPL_INSTANCE(srpl_connection);
    STATE_ANNOUNCE_NO_EVENTS(srpl_connection);

    // Send "send candidates" message
    // Return no event
    // NOTE(review): the second argument presumably selects "request" rather than "response" -- confirm
    // against srpl_send_candidates_message_send.
    srpl_send_candidates_message_send(srpl_connection, false);
    return srpl_state_send_candidates_wait;
}
   4669 
   4670 static bool
   4671 srpl_can_transition_to_routine_state(srpl_domain_t *domain)
   4672 {
   4673     if (domain == NULL) {
   4674         INFO("returning false because there's no domain");
   4675         return false;
   4676     }
   4677 
   4678     // We only transition to routine state after discovery is completed, as
   4679     // we need to sync with all the partners discovered during the discovery
   4680     // window to join the replication
   4681     if (domain->partner_discovery_pending) {
   4682         INFO("returning false because partner discovery is still pending");
   4683         return false;
   4684     }
   4685 
   4686     if (domain->srpl_opstate == SRPL_OPSTATE_ROUTINE) {
   4687         INFO("returning false because we are already in routine state");
   4688         return false;
   4689     }
   4690 
   4691     if (domain->have_dataset_id) {
   4692         for (srpl_instance_t *instance = domain->instances; instance != NULL; instance = instance->next) {
   4693             // We skip checking the instance if the instance
   4694             // 1. is to myself (this could happen if I restarts and receives a stale advertisement from myself).
   4695             // 2. is not discovered during the discovery_timeout, or
   4696             // 3. the dataset id is not what we are looking for, or
   4697             // 4. is discontinuing, or
   4698             // 5. has lower version
   4699             if (instance->is_me || !instance->sync_to_join ||
   4700                 instance->dataset_id != domain->dataset_id ||
   4701                 instance->discontinuing || instance->version_mismatch)
   4702             {
   4703                 INFO("instance " PUB_S_SRP ": is_me (" PUB_S_SRP ") or sync_to_join (" PUB_S_SRP
   4704                      ") or discontinuing (" PUB_S_SRP ") or version_mismatch (" PUB_S_SRP ")",
   4705                      instance->instance_name, instance->is_me ? "true" : "false", instance->sync_to_join ? "true" : "false",
   4706                      instance->discontinuing ? "true" : "false", instance->version_mismatch ? "true" : "false");
   4707                 continue;
   4708             }
   4709             // Instance is a valid partner that we should sync with to possibly move to the routine state
   4710             if (instance->connection == NULL ||
   4711                 !instance->connection->database_synchronized)
   4712             {
   4713                 INFO("synchronization on " PRI_S_SRP " with partner_id %" PRIx64 " is not ready (%p " PUB_S_SRP ").",
   4714                      instance->instance_name, instance->partner_id, instance->connection,
   4715                      (instance->connection == NULL ? "null" :
   4716                      instance->connection->database_synchronized ? "true" : "false"));
   4717                 return false;
   4718             }
   4719         }
   4720     }
   4721     INFO("ready");
   4722     return true;
   4723 }
   4724 
   4725 static void
   4726 srpl_store_dataset_id(srpl_domain_t *domain)
   4727 {
   4728     uint64_t dataset_id = domain->dataset_id;
   4729     uint8_t msb;
   4730     OSStatus err;
   4731 
   4732     // read out the stored msb of the dataset id. increment the msb and generate the dataset id.
   4733     const CFStringRef app_id = CFSTR("com.apple.srp-mdns-proxy.preferences");
   4734     const CFStringRef key = CFStringCreateWithFormat(kCFAllocatorDefault, NULL, CFSTR("dataset-id-msb-%s"), domain->name);
   4735 
   4736     if (key) {
   4737         msb = (dataset_id & 0xFF00000000000000) >> 56;
   4738         err = CFPrefs_SetInt64(app_id, key, msb);
   4739 
   4740         if (err) {
   4741             ERROR("Unable to store the msb of the dataset id in preferences.");
   4742         }
   4743         INFO("store msb %d.", msb);
   4744         CFRelease(key);
   4745     } else {
   4746         ERROR("unable to create key for domain " PRI_S_SRP, domain->name);
   4747     }
   4748 }
   4749 
   4750 
   4751 // SRPL partners MUST persist the highest (most significant byte or MSB) of the dataset ID.
   4752 // When generating a new dataset ID, the partner MUST increment the MSB of last used dataset
   4753 // ID to use as MSB of new dataset ID and populate the lower 56 bits randomly. If there is no
   4754 // previously saved ID, then the partner randomly generates the entire 64-bit ID.
   4755 static uint64_t
   4756 srpl_generate_store_dataset_id(srpl_domain_t *domain)
   4757 {
   4758     uint64_t dataset_id;
   4759     uint8_t msb;
   4760     OSStatus err;
   4761 
   4762     // read out the stored msb of the dataset id. increment the msb and generate the dataset id.
   4763     const CFStringRef app_id = CFSTR("com.apple.srp-mdns-proxy.preferences");
   4764     const CFStringRef key = CFStringCreateWithFormat(kCFAllocatorDefault, NULL, CFSTR("dataset-id-msb-%s"), domain->name);
   4765 
   4766     if (key) {
   4767         msb = (uint8_t)CFPrefs_GetInt64(app_id, key, &err);
   4768         if (err) {
   4769             INFO("fail to fetch msb, generate random dataset id.");
   4770             dataset_id = srp_random64();
   4771         } else {
   4772             dataset_id = (((uint64_t)msb+1) << 56) | (srp_random64() & LOWER56_BIT_MASK);
   4773         }
   4774         // store the most significant byte (msb) of the generated dataset id
   4775         msb = (dataset_id & 0xFF00000000000000) >> 56;
   4776         err = CFPrefs_SetInt64(app_id, key, msb);
   4777 
   4778         if (err) {
   4779             ERROR("Unable to store the msb of the dataset id in preferences.");
   4780         }
   4781         CFRelease(key);
   4782     } else {
   4783         ERROR("unable to create key for domain " PRI_S_SRP, domain->name);
   4784         dataset_id = srp_random64();
   4785     }
   4786 
   4787     return dataset_id;
   4788 }
   4789 
   4790 static void
   4791 srpl_transition_to_routine_state(srpl_domain_t *domain)
   4792 {
   4793     domain->srpl_opstate = SRPL_OPSTATE_ROUTINE;
   4794     INFO("transitions to routine state in domain " PRI_S_SRP, domain->name);
   4795     // If the partner does not discover any other partners advertising
   4796     // the same domain in the "startup" state, it generates a new dataset
   4797     // ID when entering the "routine operation" state.
   4798     // When generating a new dataset ID, the partner MUST increment the MSB of
   4799     // last used dataset ID to use as MSB of new dataset ID and populate the
   4800     // lower 56 bits randomly using a random number generator. If there is no
   4801     // previously saved ID, then the partner randomly generates the entire 64-bit ID.
   4802     if (!domain->have_dataset_id) {
   4803         domain->dataset_id = srpl_generate_store_dataset_id(domain);
   4804         domain->have_dataset_id = true;
   4805         domain->dataset_id_committed = true;
   4806         INFO("generate new dataset id %" PRIx64 " for domain " PRI_S_SRP,
   4807              domain->dataset_id, domain->name);
   4808     } else {
   4809         // if there are instances that have lower version number but the MSB of their dataset_id
   4810         // is larger than ours, we'll increment the MSB of the instance dataset id and use it
   4811         // as our dataset id. this is to guarantee that the current version has a higher sequence
   4812         // number so that its service will be preferred.
   4813         for (srpl_instance_t *instance = domain->instances; instance != NULL; instance = instance->next) {
   4814             if (instance->version_mismatch &&
   4815                 srpl_dataset_id_compare(instance->dataset_id & 0xFF00000000000000,
   4816                                         domain->dataset_id & 0xFF00000000000000) >= 0)
   4817             {
   4818                 INFO("increment the MSB of the dataset_id %" PRIx64 " of instance " PRI_S_SRP,
   4819                      instance->dataset_id, instance->instance_name);
   4820                 domain->dataset_id = instance->dataset_id + 0x0100000000000000;
   4821             }
   4822         }
   4823     }
   4824 #if STUB_ROUTER
   4825     srp_server_t *server_state = domain->server_state;
   4826     // Advertise the SRPL service in the "routine" state.
   4827     srpl_domain_advertise(domain);
   4828     if (!strcmp(domain->name, server_state->current_thread_domain_name)) {
   4829         route_state_t *route_state = server_state->route_state;
   4830         if (route_state != NULL) {
   4831             route_state->thread_sequence_number = (domain->dataset_id & 0xFF00000000000000) >> 56;
   4832             INFO("thread sequence number 0x%02x", route_state->thread_sequence_number);
   4833             route_state->partition_can_advertise_anycast_service = true;
   4834             partition_maybe_advertise_anycast_service(route_state);
   4835         }
   4836     }
   4837 #endif
   4838 }
   4839 
   4840 static bool
   4841 srpl_keep_current_dataset_id(srpl_domain_t *domain, srpl_instance_t *instance)
   4842 {
   4843     bool keep = true;
   4844     bool stored = domain->dataset_id_committed;
   4845     if (instance->have_dataset_id) {
   4846         uint64_t new = instance->dataset_id;
   4847         int compare = srpl_dataset_id_compare(new, domain->dataset_id);
   4848         if (compare == 0) {
   4849             domain->dataset_id_committed = true;
   4850             keep = true;
   4851             INFO("keep and commit dataset_id %" PRIx64, domain->dataset_id);
   4852         } else if (compare > 0) {
   4853             INFO("abandon dataset_id %" PRIx64 " and commit preferred %" PRIx64, domain->dataset_id, new);
   4854             domain->dataset_id = new;
   4855             domain->dataset_id_committed = true;
   4856             keep = false;
   4857         } else {
   4858             INFO("non-preferred dataset id %" PRIx64, instance->dataset_id);
   4859         }
   4860     }
   4861     // we store the msb of dataset id if we haven't done so for current
   4862     // committed dataset id or the committed dataset id has changed.
   4863     if ((!stored || !keep) && domain->dataset_id_committed) {
   4864         srpl_store_dataset_id(domain);
   4865     }
   4866     return keep;
   4867 }
   4868 
   4869 static void
   4870 srpl_state_transition_by_dataset_id(srpl_domain_t *domain, srpl_instance_t *instance)
   4871 {
   4872     if (!srpl_keep_current_dataset_id(domain, instance)) {
   4873         // DNS-SD SRP Replication Spec: if at any time (regardless of "startup" or "routine
   4874         // operation" state) an SRPL partner discovers that it is synchronizing with a
   4875         // non-preferred dataset ID, it MUST abandon that dataset, re-enter the "startup"
   4876         // state, and attempt to synchronize with the (newly discovered) preferred dataset id.
   4877         INFO("more preferred dataset id %" PRIx64 ", reenter startup state", domain->dataset_id);
   4878         srpl_abandon_nonpreferred_dataset(domain);
   4879         srpl_transition_to_startup_state(domain);
   4880     } else {
   4881         srpl_maybe_sync_or_transition(domain);
   4882     }
   4883 }
   4884 
   4885 // Used by srpl_send_candidates_wait_action and srpl_host_wait_action
   4886 static srpl_state_t
   4887 srpl_send_candidates_wait_event_process(srpl_connection_t *srpl_connection, srpl_event_t *event)
   4888 {
   4889     srpl_domain_t *domain = srpl_connection_domain(srpl_connection);
   4890     if (event->event_type == srpl_event_send_candidates_response_received) {
   4891         if (srpl_connection->is_server) {
   4892             srpl_connection->database_synchronized = true;
   4893             srpl_instance_t *instance = srpl_connection->instance;
   4894             instance->sync_fail = false;
   4895             srpl_state_transition_by_dataset_id(domain, instance);
   4896             return srpl_state_ready;
   4897         } else {
   4898             return srpl_state_send_candidates_message_wait;
   4899         }
   4900     } else if (event->event_type == srpl_event_candidate_received) {
   4901         srpl_connection_candidate_set(srpl_connection, event->content.candidate);
   4902         event->content.candidate = NULL; // steal!
   4903         return srpl_state_candidate_check;
   4904     } else {
   4905         UNEXPECTED_EVENT(srpl_connection, event);
   4906     }
   4907 }
   4908 
   4909 // We reach this state after having sent a "send candidates" message, so we can in principle get either a
   4910 // "candidate" message or a "send candidates" response here, leading either to send_candidates check or one
   4911 // of two states depending on whether this connection is an incoming or outgoing connection. Outgoing
   4912 // connections send the "send candidates" message first, so when they get a "send candidates" reply, they
   4913 // need to wait for a "send candidates" message from the remote. Incoming connections send the "send candidates"
   4914 // message last, so when they get the "send candidates" reply, the database sync is done and it's time to
   4915 // just deal with ongoing updates. In this case we go to the check_for_srp_client_updates state, which
   4916 // looks to see if any updates came in from SRP clients while we were syncing the databases.
   4917 static srpl_state_t
   4918 srpl_send_candidates_wait_action(srpl_connection_t *srpl_connection, srpl_event_t *event)
   4919 {
   4920     REQUIRE_SRPL_INSTANCE(srpl_connection);
   4921     STATE_ANNOUNCE(srpl_connection, event);
   4922 
   4923     if (event == NULL) {
   4924         return srpl_state_invalid; // Wait for events.
   4925     }
   4926     return srpl_send_candidates_wait_event_process(srpl_connection, event);
   4927 }
   4928 
   4929 static srpl_candidate_disposition_t
   4930 srpl_candidate_host_check(srpl_connection_t *srpl_connection, adv_host_t *host)
   4931 {
   4932     // Evaluate candidate
   4933     // Return "host candidate wanted" or "host candidate not wanted" event
   4934     if (host == NULL) {
   4935         INFO("host is NULL, answer is yes.");
   4936         return srpl_candidate_yes;
   4937     } else {
   4938         if (host->removed) {
   4939             INFO("host is removed, answer is yes.");
   4940             return srpl_candidate_yes;
   4941         } else if (host->key_id != srpl_connection->candidate->key_id) {
   4942             INFO("host key conflict (%x vs %x), answer is conflict.", host->key_id, srpl_connection->candidate->key_id);
   4943             return srpl_candidate_conflict;
   4944         } else {
   4945             // We allow for a bit of jitter. Bear in mind that candidates only happen on startup, so
   4946             // even if a previous run of the SRP server on this device was responsible for registering
   4947             // the candidate, we don't have it, so we still need it.
   4948             if (host->update_time - srpl_connection->candidate->update_time > SRPL_UPDATE_JITTER_WINDOW) {
   4949                 INFO("host update time %" PRId64 " candidate update time %" PRId64 ", answer is no.",
   4950                      (int64_t)host->update_time, (int64_t)srpl_connection->candidate->update_time);
   4951                 return srpl_candidate_no;
   4952             } else {
   4953                 INFO("host update time %" PRId64 " candidate update time %" PRId64 ", answer is yes.",
   4954                      (int64_t)host->update_time, (int64_t)srpl_connection->candidate->update_time);
   4955                 return srpl_candidate_yes;
   4956             }
   4957         }
   4958     }
   4959 }
   4960 
   4961 // We enter this state after we've received a "candidate" message, and check to see if we want the host the candidate
   4962 // represents. We then send an appropriate response.
   4963 static srpl_state_t
   4964 srpl_candidate_check_action(srpl_connection_t *srpl_connection, srpl_event_t *event)
   4965 {
   4966     REQUIRE_SRPL_EVENT_NULL(srpl_connection, event);
   4967     REQUIRE_SRPL_INSTANCE(srpl_connection);
   4968     STATE_ANNOUNCE_NO_EVENTS_NAME(srpl_connection, srpl_connection->candidate->name);
   4969 
   4970     adv_host_t *host = srp_adv_host_copy(srpl_connection->instance->domain->server_state,
   4971                                          srpl_connection->candidate->name);
   4972     srpl_candidate_disposition_t disposition = srpl_candidate_host_check(srpl_connection, host);
   4973     if (host != NULL) {
   4974         srp_adv_host_release(host);
   4975     }
   4976     switch(disposition) {
   4977     case srpl_candidate_yes:
   4978         srpl_candidate_response_send(srpl_connection, kDSOType_SRPLCandidateYes);
   4979         return srpl_state_candidate_host_wait;
   4980     case srpl_candidate_no:
   4981         srpl_candidate_response_send(srpl_connection, kDSOType_SRPLCandidateNo);
   4982         return srpl_state_send_candidates_wait;
   4983     case srpl_candidate_conflict:
   4984         srpl_candidate_response_send(srpl_connection, kDSOType_SRPLConflict);
   4985         return srpl_state_send_candidates_wait;
   4986     }
   4987     return srpl_state_invalid;
   4988 }
   4989 
   4990 // In candidate_host_send_wait, we take no action and wait for events. We're hoping for a "host" message, leading to
   4991 // candidate_host_prepare. We could also receive a "candidate" message, leading to candidate_received, or a "send
   4992 // candidates" reply, leading to candidate_reply_received.
   4993 
   4994 static srpl_state_t
   4995 srpl_candidate_host_wait_action(srpl_connection_t *srpl_connection, srpl_event_t *event)
   4996 {
   4997     REQUIRE_SRPL_INSTANCE(srpl_connection);
   4998     STATE_ANNOUNCE(srpl_connection, event);
   4999 
   5000     if (event == NULL) {
   5001         return srpl_state_invalid; // Wait for events.
   5002     } else if (event->event_type == srpl_event_host_message_received) {
   5003         // Copy the update information, retain what's refcounted, and free what's not on the event.
   5004         srpl_host_update_steal_parts(&srpl_connection->stashed_host, &event->content.host_update);
   5005         return srpl_state_candidate_host_prepare;
   5006     } else {
   5007         return srpl_send_candidates_wait_event_process(srpl_connection, event);
   5008     }
   5009 }
   5010 
   5011 // Here we want to see if we can do an immediate update; if so, we go to candidate_host_re_evaluate; otherwise
   5012 // we go to candidate_host_contention_wait
   5013 static srpl_state_t
   5014 srpl_candidate_host_prepare_action(srpl_connection_t *srpl_connection, srpl_event_t *event)
   5015 {
   5016     REQUIRE_SRPL_EVENT_NULL(srpl_connection, event);
   5017     REQUIRE_SRPL_INSTANCE(srpl_connection);
   5018     STATE_ANNOUNCE_NO_EVENTS_NAME(srpl_connection, srpl_connection->candidate->name);
   5019 
   5020     // Apply the host from the event to the current host list
   5021     // Return no event
   5022     adv_host_t *host = srp_adv_host_copy(srpl_connection->instance->domain->server_state,
   5023                                          srpl_connection->candidate->name);
   5024     if (host == NULL) {
   5025         // If we don't have this host, we can apply the update immediately.
   5026         return srpl_state_candidate_host_apply;
   5027     }
   5028     if (host->srpl_connection != NULL || host->update != NULL) {
   5029         // We are processing an update from a different srpl server or a client.
   5030         INFO(PRI_S_SRP ": host->srpl_connection = %p  host->update=%p--going into contention",
   5031              srpl_connection->name, host->srpl_connection, host->update);
   5032         srp_adv_host_release(host);
   5033         return srpl_state_candidate_host_contention_wait;
   5034     } else {
   5035         srpl_connection->candidate->host = host;
   5036         return srpl_state_candidate_host_re_evaluate;
   5037     }
   5038 }
   5039 
   5040 static adv_host_t *
   5041 srpl_client_update_matches(dns_name_t *hostname, srpl_event_t *event)
   5042 {
   5043     adv_host_t *host = event->content.client_result.host;
   5044     if (event->content.client_result.rcode == dns_rcode_noerror && dns_names_equal_text(hostname, host->name)) {
   5045         INFO("returning host " PRI_S_SRP, host->name);
   5046         return host;
   5047     }
   5048     char name[kDNSServiceMaxDomainName];
   5049     dns_name_print(hostname, name, sizeof(name));
   5050     INFO("returning NULL: rcode = " PUB_S_SRP "  hostname = " PRI_S_SRP "  host->name = " PRI_S_SRP,
   5051          dns_rcode_name(event->content.client_result.rcode), name, host->name);
   5052     return NULL;
   5053 }
   5054 
   5055 // and wait for a srp_client_update_finished event for the host, which
   5056 // will trigger us to move to candidate_host_re_evaluate.
   5057 static srpl_state_t
   5058 srpl_candidate_host_contention_wait_action(srpl_connection_t *srpl_connection, srpl_event_t *event)
   5059 {
   5060     REQUIRE_SRPL_INSTANCE(srpl_connection);
   5061     STATE_ANNOUNCE(srpl_connection, event);
   5062 
   5063     if (event == NULL) {
   5064         return srpl_state_invalid; // Wait for events.
   5065     } else if (event->event_type == srpl_event_srp_client_update_finished) {
   5066         adv_host_t *host = srpl_client_update_matches(srpl_connection->candidate->name, event);
   5067         if (host != NULL) {
   5068             srpl_connection->candidate->host = host;
   5069             srp_adv_host_retain(srpl_connection->candidate->host);
   5070             return srpl_state_candidate_host_re_evaluate;
   5071         }
   5072         return srpl_state_invalid; // Keep waiting
   5073     } else if (event->event_type == srpl_event_advertise_finished) {
   5074         // See if this is an event on the host we were waiting for.
   5075         if (event->content.advertise_finished.hostname != NULL &&
   5076             dns_names_equal_text(srpl_connection->candidate->name, event->content.advertise_finished.hostname))
   5077         {
   5078             return srpl_state_candidate_host_re_evaluate;
   5079         }
   5080         return srpl_state_invalid;
   5081     } else {
   5082         UNEXPECTED_EVENT(srpl_connection, event);
   5083     }
   5084 }
   5085 
   5086 // At this point we've either waited for the host to no longer be in contention, or else it wasn't in contention.
   5087 // There was a time gap between when we sent the candidate response and when the host message arrived, so an update
   5088 // may have arrived locally for that SRP client. We therefore re-evaluate at this point.
   5089 static srpl_state_t
   5090 srpl_candidate_host_re_evaluate_action(srpl_connection_t *srpl_connection, srpl_event_t *event)
   5091 {
   5092     REQUIRE_SRPL_EVENT_NULL(srpl_connection, event);
   5093     REQUIRE_SRPL_INSTANCE(srpl_connection);
   5094     STATE_ANNOUNCE_NO_EVENTS_NAME(srpl_connection, srpl_connection->candidate->name);
   5095 
   5096     adv_host_t *host = srpl_connection->candidate->host;
   5097     // The host we retained may have become invalid; if so, discard it
   5098     if (host != NULL && !srp_adv_host_valid(host)) {
   5099         srp_adv_host_release(srpl_connection->candidate->host);
   5100         srpl_connection->candidate->host = host = NULL;
   5101     }
   5102     // If it was invalidated, or if we got here directly, look up the host by name
   5103     if (host == NULL) {
   5104         host = srp_adv_host_copy(srpl_connection->instance->domain->server_state,
   5105                                  srpl_connection->candidate->name);
   5106         srpl_connection->candidate->host = host;
   5107     }
   5108     // It's possible that the host is gone; in this case we definitely want the update.
   5109     if (host == NULL) {
   5110         return srpl_state_candidate_host_apply;
   5111     }
   5112 
   5113     // At this point we know that the host we were looking for is valid. Now check to see if we still want to apply it.
   5114     srpl_state_t ret = srpl_state_invalid;
   5115     srpl_candidate_disposition_t disposition = srpl_candidate_host_check(srpl_connection, host);
   5116     switch(disposition) {
   5117     case srpl_candidate_yes:
   5118         ret = srpl_state_candidate_host_apply;
   5119         break;
   5120     case srpl_candidate_no:
   5121         // This happens if we got a candidate and wanted it, but then got an SRP update on that candidate while waiting
   5122         // for events. In this case, there's no real problem, and the successful update should trigger an update to be
   5123         // sent to the remote.
   5124         srpl_host_response_send(srpl_connection, dns_rcode_noerror);
   5125         INFO("candidate_no: freeing parts");
   5126         srpl_host_update_parts_free(&srpl_connection->stashed_host);
   5127         ret = srpl_state_send_candidates_wait;
   5128         break;
   5129     case srpl_candidate_conflict:
   5130         srpl_host_response_send(srpl_connection, dns_rcode_yxdomain);
   5131         INFO("candidate_conflict: freeing parts");
   5132         srpl_host_update_parts_free(&srpl_connection->stashed_host);
   5133         ret = srpl_state_send_candidates_wait;
   5134         break;
   5135     }
   5136     return ret;
   5137 }
   5138 
   5139 static bool
   5140 srpl_connection_host_apply(srpl_connection_t *srpl_connection)
   5141 {
   5142     DNS_NAME_GEN_SRP(srpl_connection->stashed_host.hostname, name_buf);
   5143     INFO("applying update from " PRI_S_SRP " for host " PRI_DNS_NAME_SRP ", %d messages",
   5144          srpl_connection->name, DNS_NAME_PARAM_SRP(srpl_connection->stashed_host.hostname, name_buf),
   5145          srpl_connection->stashed_host.num_messages);
   5146     if (!srp_parse_host_messages_evaluate(srpl_connection->instance->domain->server_state, srpl_connection,
   5147                                           srpl_connection->stashed_host.messages,
   5148                                           srpl_connection->stashed_host.num_messages))
   5149     {
   5150         srpl_host_response_send(srpl_connection, dns_rcode_formerr);
   5151         return false;
   5152     }
   5153     return true;
   5154 }
   5155 
   5156 // At this point we know there is no contention on the host, and we want to update it, so start the update by passing the
   5157 // host message to dns_evaluate.
   5158 static srpl_state_t
   5159 srpl_candidate_host_apply_action(srpl_connection_t *srpl_connection, srpl_event_t *event)
   5160 {
   5161     REQUIRE_SRPL_INSTANCE(srpl_connection);
   5162     STATE_ANNOUNCE(srpl_connection, event);
   5163 
   5164     // Apply the host from the event to the current host list
   5165     // Return no event
   5166     // Note that we set host->srpl_connection _after_ we call dns_evaluate. This ensures that any "advertise_finished"
   5167     // calls that are done during the call to dns_evaluate do not deliver an event here.
   5168     if (event == NULL) {
   5169         if (!srpl_connection_host_apply(srpl_connection)) {
   5170             return srpl_state_send_candidates_wait;
   5171         }
   5172         return srpl_state_candidate_host_apply_wait;
   5173     } else if (event->event_type == srpl_event_advertise_finished) {
   5174         // This shouldn't be possible anymore, but I'm putting a FAULT in here in case I'm mistaken.
   5175         FAULT(PRI_S_SRP ": advertise_finished event!", srpl_connection->name);
   5176         return srpl_state_invalid;
   5177     } else {
   5178         UNEXPECTED_EVENT(srpl_connection, event);
   5179     }
   5180 }
   5181 
   5182 // Called by the SRP server when an advertise has finished for an update recevied on a connection.
   5183 static void
   5184 srpl_deferred_advertise_finished_event_deliver(void *context)
   5185 {
   5186     srpl_event_t *event = context;
   5187     srp_server_t *server_state = event->content.advertise_finished.server_state;
   5188     for (srpl_domain_t *domain = server_state->srpl_domains; domain != NULL; domain = domain->next) {
   5189         for (srpl_instance_t *instance = domain->instances; instance != NULL; instance = instance->next) {
   5190             if (instance->connection != NULL) {
   5191                 srpl_event_deliver(instance->connection, event);
   5192             }
   5193         }
   5194     }
   5195     for (srpl_instance_t *instance = server_state->unmatched_instances; instance != NULL; instance = instance->next) {
   5196         if (instance->connection != NULL) {
   5197             srpl_event_deliver(instance->connection, event);
   5198         }
   5199     }
   5200 
   5201     free(event->content.advertise_finished.hostname);
   5202     free(event);
   5203 }
   5204 
   5205 // Send an advertise_finished event for the specified hostname to all connections. Because this is called from
   5206 // advertise_finished, we do not want any state machine to advance immediately, so we defer delivery of this
   5207 // event until the next time we return to the main event loop.
   5208 void
   5209 srpl_advertise_finished_event_send(char *hostname, int rcode, srp_server_t *server_state)
   5210 {
   5211     srpl_event_t *event = calloc(1, sizeof(*event));
   5212     if (event == NULL) {
   5213         ERROR("No memory to defer advertise_finished event for " PUB_S_SRP, hostname);
   5214         return;
   5215     }
   5216 
   5217     srpl_event_initialize(event, srpl_event_advertise_finished);
   5218     event->content.advertise_finished.rcode = rcode;
   5219     event->content.advertise_finished.hostname = strdup(hostname);
   5220     event->content.advertise_finished.server_state = server_state;
   5221     if (event->content.advertise_finished.hostname == NULL) {
   5222         INFO(PRI_S_SRP ": no memory for hostname", hostname);
   5223         free(event);
   5224         return;
   5225     }
   5226     ioloop_run_async(srpl_deferred_advertise_finished_event_deliver, event);
   5227 }
   5228 
   5229 
   5230 // We enter this state to wait for the application of a host update to complete.
   5231 // We exit the state for the send_candidates_wait state when we receive an advertise_finished event.
   5232 // Additionally when we receive an advertise_finished event we send a "host" response with the rcode
   5233 // returned in the advertise_finished event.
   5234 static srpl_state_t
   5235 srpl_candidate_host_apply_wait_action(srpl_connection_t *srpl_connection, srpl_event_t *event)
   5236 {
   5237     REQUIRE_SRPL_INSTANCE(srpl_connection);
   5238     STATE_ANNOUNCE(srpl_connection, event);
   5239 
   5240     if (event == NULL) {
   5241         return srpl_state_invalid; // Wait for events.
   5242     } else if (event->event_type == srpl_event_advertise_finished) {
   5243         srpl_host_response_send(srpl_connection, event->content.advertise_finished.rcode);
   5244         INFO("freeing parts");
   5245         srpl_host_update_parts_free(&srpl_connection->stashed_host);
   5246         return srpl_state_send_candidates_wait;
   5247     } else {
   5248         UNEXPECTED_EVENT(srpl_connection, event);
   5249     }
   5250 }
   5251 
   5252 // This marks the end of states that occur as a result of sending a "send candidates" message.
   5253 // This marks the beginning of states that occur as a result of receiving a send_candidates message.
   5254 
   5255 // We have received a "send candidates" message; the action is to create a candidates list.
   5256 static srpl_state_t
   5257 srpl_send_candidates_received_action(srpl_connection_t *srpl_connection, srpl_event_t *event)
   5258 {
   5259     int num_candidates;
   5260     REQUIRE_SRPL_EVENT_NULL(srpl_connection, event);
   5261     REQUIRE_SRPL_INSTANCE(srpl_connection);
   5262     STATE_ANNOUNCE_NO_EVENTS(srpl_connection);
   5263 
   5264     // Make sure we don't have a candidate list.
   5265     if (srpl_connection->candidates != NULL) {
   5266         srpl_connection_candidates_free(srpl_connection);
   5267         srpl_connection->candidates = NULL;
   5268         // Just in case we exit due to a failure...
   5269         srpl_connection->num_candidates = 0;
   5270         srpl_connection->current_candidate = -1;
   5271     }
   5272     // Generate a list of candidates from the current host list.
   5273     // Return no event
   5274     srp_server_t *server_state = srpl_connection->instance->domain->server_state;
   5275     num_candidates = srp_current_valid_host_count(server_state);
   5276     if (num_candidates > 0) {
   5277         adv_host_t **candidates = calloc(num_candidates, sizeof(*candidates));
   5278         int copied_candidates;
   5279         if (candidates == NULL) {
   5280             ERROR("unable to allocate candidates list.");
   5281             return srpl_connection_drop_state(srpl_connection->instance, srpl_connection);
   5282         }
   5283         copied_candidates = srp_hosts_to_array(server_state, candidates, num_candidates);
   5284         if (copied_candidates > num_candidates) {
   5285             FAULT("copied_candidates %d > num_candidates %d",
   5286                   copied_candidates, num_candidates);
   5287             return srpl_connection_drop_state(srpl_connection->instance, srpl_connection);
   5288         }
   5289         if (num_candidates != copied_candidates) {
   5290             INFO("srp_hosts_to_array returned the wrong number of hosts: copied_candidates %d > num_candidates %d",
   5291                  copied_candidates, num_candidates);
   5292             num_candidates = copied_candidates;
   5293         }
   5294         srpl_connection->candidates = candidates;
   5295     }
   5296     srpl_connection->candidates_not_generated = false;
   5297     srpl_connection->num_candidates = num_candidates;
   5298     srpl_connection->current_candidate = -1;
   5299     return srpl_state_send_candidates_remaining_check;
   5300 }
   5301 
   5302 // See if there are candidates remaining; if not, send "send candidates" response.
   5303 static srpl_state_t
   5304 srpl_candidates_remaining_check_action(srpl_connection_t *srpl_connection, srpl_event_t *event)
   5305 {
   5306     REQUIRE_SRPL_EVENT_NULL(srpl_connection, event);
   5307     REQUIRE_SRPL_INSTANCE(srpl_connection);
   5308     STATE_ANNOUNCE_NO_EVENTS(srpl_connection);
   5309 
   5310     // Get the next candidate out of the candidate list
   5311     // Return "no candidates left" or "next candidate"
   5312     if (srpl_connection->current_candidate + 1 < srpl_connection->num_candidates) {
   5313         srpl_connection->current_candidate++;
   5314         return srpl_state_next_candidate_send;
   5315     } else {
   5316         return srpl_state_send_candidates_response_send;
   5317     }
   5318 }
   5319 
   5320 // Send the next candidate.
   5321 static srpl_state_t
   5322 srpl_next_candidate_send_action(srpl_connection_t *srpl_connection, srpl_event_t *event)
   5323 {
   5324     REQUIRE_SRPL_EVENT_NULL(srpl_connection, event);
   5325     REQUIRE_SRPL_INSTANCE(srpl_connection);
   5326     STATE_ANNOUNCE_NO_EVENTS(srpl_connection);
   5327 
   5328     srpl_candidate_message_send(srpl_connection, srpl_connection->candidates[srpl_connection->current_candidate]);
   5329     return srpl_state_next_candidate_send_wait;
   5330 }
   5331 
   5332 // Wait for a "candidate" response.
   5333 static srpl_state_t
   5334 srpl_next_candidate_send_wait_action(srpl_connection_t *srpl_connection, srpl_event_t *event)
   5335 {
   5336     REQUIRE_SRPL_INSTANCE(srpl_connection);
   5337     STATE_ANNOUNCE(srpl_connection, event);
   5338 
   5339     if (event == NULL) {
   5340         return srpl_state_invalid; // Wait for events.
   5341     } else if (event->event_type == srpl_event_candidate_response_received) {
   5342         switch (event->content.disposition) {
   5343         case srpl_candidate_yes:
   5344             return srpl_state_candidate_host_send;
   5345         case srpl_candidate_no:
   5346         case srpl_candidate_conflict:
   5347             return srpl_state_send_candidates_remaining_check;
   5348         }
   5349         return srpl_state_invalid;
   5350     } else {
   5351         UNEXPECTED_EVENT(srpl_connection, event);
   5352     }
   5353 }
   5354 
   5355 // Send the host for the candidate.
   5356 static srpl_state_t
   5357 srpl_candidate_host_send_action(srpl_connection_t *srpl_connection, srpl_event_t *event)
   5358 {
   5359     REQUIRE_SRPL_EVENT_NULL(srpl_connection, event);
   5360     REQUIRE_SRPL_INSTANCE(srpl_connection);
   5361     STATE_ANNOUNCE_NO_EVENTS(srpl_connection);
   5362 
   5363     // It's possible that the host that we put on the candidates list has become invalid. If so, just go back and send
   5364     // the next candidate (or finish).
   5365     adv_host_t *host = srpl_connection->candidates[srpl_connection->current_candidate];
   5366     if (!srp_adv_host_valid(host) || host->message == NULL) {
   5367         return srpl_state_send_candidates_remaining_check;
   5368     }
   5369     if (!srpl_host_message_send(srpl_connection, host)) {
   5370         srpl_disconnect(srpl_connection);
   5371         return srpl_state_invalid;
   5372     }
   5373     return srpl_state_candidate_host_response_wait;
   5374 }
   5375 
   5376 // Wait for a "host" response.
   5377 static srpl_state_t
   5378 srpl_candidate_host_response_wait_action(srpl_connection_t *srpl_connection, srpl_event_t *event)
   5379 {
   5380     REQUIRE_SRPL_INSTANCE(srpl_connection);
   5381     STATE_ANNOUNCE(srpl_connection, event);
   5382 
   5383     if (event == NULL) {
   5384         return srpl_state_invalid; // Wait for events.
   5385     } else if (event->event_type == srpl_event_host_response_received) {
   5386         // The only failure case we care about is a conflict, and we don't have a way to handle that, so just
   5387         // continue without checking the status.
   5388         return srpl_state_send_candidates_remaining_check;
   5389     } else {
   5390         UNEXPECTED_EVENT(srpl_connection, event);
   5391     }
   5392 }
   5393 
   5394 // At this point we're done sending candidates, so we send a "send candidates" response.
   5395 static srpl_state_t
   5396 srpl_send_candidates_response_send_action(srpl_connection_t *srpl_connection, srpl_event_t *event)
   5397 {
   5398     REQUIRE_SRPL_EVENT_NULL(srpl_connection, event);
   5399     REQUIRE_SRPL_INSTANCE(srpl_connection);
   5400     STATE_ANNOUNCE_NO_EVENTS(srpl_connection);
   5401 
   5402     srpl_send_candidates_message_send(srpl_connection, true);
   5403     // When the server has sent its candidate response, it's immediately ready to send a "send candidate" message
   5404     // When the client has sent its candidate response, the database synchronization is done on the client.
   5405     if (srpl_connection->is_server) {
   5406         return srpl_state_send_candidates_send;
   5407     } else {
   5408         srpl_domain_t *domain = srpl_connection_domain(srpl_connection);
   5409         srpl_connection->database_synchronized = true;
   5410         srpl_instance_t *instance = srpl_connection->instance;
   5411         instance->sync_fail = false;
   5412         srpl_state_transition_by_dataset_id(domain, instance);
   5413         return srpl_state_ready;
   5414     }
   5415 }
   5416 
   5417 // The ready state is where we land when there's no remaining work to do. We wait for events, and when we get one,
   5418 // we handle it, ultimately returning to this state.
   5419 static srpl_state_t
   5420 srpl_ready_action(srpl_connection_t *srpl_connection, srpl_event_t *event)
   5421 {
   5422     REQUIRE_SRPL_INSTANCE(srpl_connection);
   5423     STATE_ANNOUNCE(srpl_connection, event);
   5424 
   5425     if (event == NULL) {
   5426         // Whenever we newly land in this state, see if there is an unsent client update at the head of the
   5427         // queue, and if so, send it.
   5428         if (srpl_connection->client_update_queue != NULL && !srpl_connection->client_update_queue->sent) {
   5429             adv_host_t *host = srpl_connection->client_update_queue->host;
   5430             if (host == NULL || host->name == NULL) {
   5431                 INFO(PRI_S_SRP ": we have an update to send for bogus host %p.", srpl_connection->name, host);
   5432             } else {
   5433                 INFO(PRI_S_SRP ": we have an update to send for host " PRI_S_SRP, srpl_connection->name,
   5434                      srpl_connection->client_update_queue->host->name);
   5435             }
   5436             return srpl_state_srp_client_update_send;
   5437         } else {
   5438             if (srpl_connection->client_update_queue != NULL) {
   5439                 adv_host_t *host = srpl_connection->client_update_queue->host;
   5440                 if (host == NULL || host->name == NULL) {
   5441                     INFO(PRI_S_SRP ": there is anupdate that's marked sent for bogus host %p.",
   5442                          srpl_connection->name, host);
   5443                 } else {
   5444                     INFO(PRI_S_SRP ": there is an update on the queue that's marked sent for host " PRI_S_SRP,
   5445                          srpl_connection->name, host->name);
   5446                 }
   5447             } else {
   5448                 INFO(PRI_S_SRP ": the client update queue is empty.", srpl_connection->name);
   5449             }
   5450         }
   5451         return srpl_state_invalid;
   5452     } else if (event->event_type == srpl_event_host_message_received) {
   5453         if (srpl_connection->stashed_host.messages != NULL) {
   5454             FAULT(PRI_S_SRP ": stashed host present but host message received", srpl_connection->name);
   5455             return srpl_connection_drop_state(srpl_connection->instance, srpl_connection);
   5456         }
   5457         // Copy the update information, retain what's refcounted, and NULL out what's not, on the event.
   5458         srpl_host_update_steal_parts(&srpl_connection->stashed_host, &event->content.host_update);
   5459         return srpl_state_stashed_host_check;
   5460     } else if (event->event_type == srpl_event_host_response_received) {
   5461         return srpl_state_srp_client_ack_evaluate;
   5462     } else if (event->event_type == srpl_event_advertise_finished) {
   5463         if (srpl_connection->stashed_host.hostname != NULL &&
   5464             event->content.advertise_finished.hostname != NULL &&
   5465             dns_names_equal_text(srpl_connection->stashed_host.hostname, event->content.advertise_finished.hostname))
   5466         {
   5467             srpl_connection->stashed_host.rcode = event->content.advertise_finished.rcode;
   5468             return srpl_state_stashed_host_finished;
   5469         }
   5470         return srpl_state_invalid;
   5471     } else if (event->event_type == srpl_event_srp_client_update_finished) {
   5472         // When we receive a client update in ready state, we just need to re-run the state's action.
   5473         return srpl_state_ready;
   5474     } else {
   5475         UNEXPECTED_EVENT(srpl_connection, event);
   5476     }
   5477 }
   5478 
   5479 // We get here when there is at least one client update queued up to send
   5480 static srpl_state_t
   5481 srpl_srp_client_update_send_action(srpl_connection_t *srpl_connection, srpl_event_t *event)
   5482 {
   5483     REQUIRE_SRPL_EVENT_NULL(srpl_connection, event);
   5484     REQUIRE_SRPL_INSTANCE(srpl_connection);
   5485     STATE_ANNOUNCE_NO_EVENTS(srpl_connection);
   5486 
   5487     srpl_srp_client_queue_entry_t *update = srpl_connection->client_update_queue;
   5488     if (update != NULL) {
   5489         // If the host has a message, send it. Note that this host may well be removed, but if it had been removed
   5490         // through a lease expiry we wouldn't have got here, because the host object would have been removed from
   5491         // the list. So if it has a message attached to it, that means that either it's been removed explicitly by
   5492         // the client, which we need to propagate, or else it is still valid, and so we need to propagate the most
   5493         // recent update we got.
   5494         if (update->host->message != NULL) {
   5495             srpl_host_message_send(srpl_connection, update->host);
   5496             update->sent = true;
   5497         } else {
   5498             ERROR(PRI_S_SRP ": no host message to send for host " PRI_S_SRP ".",
   5499                   srpl_connection->name, update->host->name);
   5500 
   5501             // We're not going to send this update, so take it off the queue.
   5502             srpl_connection->client_update_queue = update->next;
   5503             srp_adv_host_release(update->host);
   5504             free(update);
   5505         }
   5506     }
   5507     return srpl_state_ready;
   5508 }
   5509 
   5510 // We go here when we get a "host" response; all we do is remove the host from the top of the queue.
   5511 static srpl_state_t
   5512 srpl_srp_client_ack_evaluate_action(srpl_connection_t *srpl_connection, srpl_event_t *event)
   5513 {
   5514     REQUIRE_SRPL_EVENT_NULL(srpl_connection, event);
   5515     REQUIRE_SRPL_INSTANCE(srpl_connection);
   5516     STATE_ANNOUNCE_NO_EVENTS(srpl_connection);
   5517 
   5518     if (srpl_connection->client_update_queue == NULL) {
   5519         FAULT(PRI_S_SRP ": update queue empty in ready, but host_response_received event received.",
   5520               srpl_connection->name);
   5521         return srpl_connection_drop_state(srpl_connection->instance, srpl_connection);
   5522     }
   5523     if (!srpl_connection->client_update_queue->sent) {
   5524         FAULT(PRI_S_SRP ": top of update queue not sent, but host_response_received event received.",
   5525               srpl_connection->name);
   5526         return srpl_connection_drop_state(srpl_connection->instance, srpl_connection);
   5527     }
   5528     srpl_srp_client_queue_entry_t *finished_update = srpl_connection->client_update_queue;
   5529     srpl_connection->client_update_queue = finished_update->next;
   5530     if (finished_update->host != NULL) {
   5531         srp_adv_host_release(finished_update->host);
   5532     }
   5533     free(finished_update);
   5534     return srpl_state_ready;
   5535 }
   5536 
   5537 // We go here when we get a "host" message
   5538 static srpl_state_t
   5539 srpl_stashed_host_check_action(srpl_connection_t *srpl_connection, srpl_event_t *event)
   5540 {
   5541     REQUIRE_SRPL_EVENT_NULL(srpl_connection, event);
   5542     REQUIRE_SRPL_INSTANCE(srpl_connection);
   5543     STATE_ANNOUNCE_NO_EVENTS_NAME(srpl_connection, srpl_connection->stashed_host.hostname);
   5544 
   5545     adv_host_t *host = srp_adv_host_copy(srpl_connection->instance->domain->server_state,
   5546                                          srpl_connection->stashed_host.hostname);
   5547     // No contention...
   5548     if (host == NULL) {
   5549         INFO("applying host because it doesn't exist locally.");
   5550         return srpl_state_stashed_host_apply;
   5551     } else if (host->update == NULL && host->srpl_connection == NULL) {
   5552         INFO("applying host because there's no contention.");
   5553         srp_adv_host_release(host);
   5554         return srpl_state_stashed_host_apply;
   5555     } else {
   5556         INFO("not applying host because there is contention. host->update %p   host->srpl_connection: %p",
   5557              host->update, host->srpl_connection);
   5558     }
   5559     srp_adv_host_release(host);
   5560     return srpl_state_ready; // Wait for something to happen
   5561 }
   5562 
   5563 // We go here when we have a stashed host to apply.
   5564 static srpl_state_t
   5565 srpl_stashed_host_apply_action(srpl_connection_t *srpl_connection, srpl_event_t *event)
   5566 {
   5567     REQUIRE_SRPL_INSTANCE(srpl_connection);
   5568     STATE_ANNOUNCE(srpl_connection, event);
   5569 
   5570     if (event == NULL) {
   5571         if (!srpl_connection_host_apply(srpl_connection)) {
   5572             srpl_connection->stashed_host.rcode = dns_rcode_servfail;
   5573             return srpl_state_stashed_host_finished;
   5574         }
   5575         return srpl_state_ready; // Wait for something to happen
   5576     } else if (event->event_type == srpl_event_advertise_finished) {
   5577         // This shouldn't be possible anymore, but I'm putting a FAULT in here in case I'm mistaken.
   5578         FAULT(PRI_S_SRP ": advertise_finished event!", srpl_connection->name);
   5579         return srpl_state_invalid;
   5580     } else {
   5581         UNEXPECTED_EVENT(srpl_connection, event);
   5582     }
   5583 }
   5584 
   5585 // We go here when a host update advertise finishes.
   5586 static srpl_state_t
   5587 srpl_stashed_host_finished_action(srpl_connection_t *srpl_connection, srpl_event_t *event)
   5588 {
   5589     REQUIRE_SRPL_EVENT_NULL(srpl_connection, event);
   5590     REQUIRE_SRPL_INSTANCE(srpl_connection);
   5591     STATE_ANNOUNCE_NO_EVENTS(srpl_connection);
   5592 
   5593     if (srpl_connection->stashed_host.hostname == NULL) {
   5594         FAULT(PRI_S_SRP ": stashed host not present, but advertise_finished event received.", srpl_connection->name);
   5595         return srpl_state_ready;
   5596     }
   5597     if (srpl_connection->stashed_host.messages == NULL) {
   5598         FAULT(PRI_S_SRP ": stashed host present, no messages.", srpl_connection->name);
   5599         return srpl_state_ready;
   5600     }
   5601     srpl_host_response_send(srpl_connection, srpl_connection->stashed_host.rcode);
   5602     INFO("freeing parts");
   5603     srpl_host_update_parts_free(&srpl_connection->stashed_host);
   5604     return srpl_state_ready;
   5605 }
   5606 
   5607 // We land here immediately after a server connection is received.
   5608 static srpl_state_t
   5609 srpl_session_message_wait_action(srpl_connection_t *srpl_connection, srpl_event_t *event)
   5610 {
   5611     REQUIRE_SRPL_INSTANCE(srpl_connection);
   5612     STATE_ANNOUNCE(srpl_connection, event);
   5613 
   5614     if (event == NULL) {
   5615         return srpl_state_invalid; // Wait for events.
   5616     } else if (event->event_type == srpl_event_session_message_received) {
   5617         // we check the remote_version. if it's smaller than our current version, we note
   5618         // version_mismatch on the instance and move to the idle state. Since no response
   5619         // is sent out, the remote peer will stay in the session_response_wait.
   5620         if (event->content.session.remote_version < SRPL_CURRENT_VERSION) { // version mismatch
   5621             INFO("instance " PRI_S_SRP " has a lower version number", srpl_connection->instance->instance_name);
   5622             srpl_connection->instance->version_mismatch = true;
   5623             return srpl_state_idle;
   5624         }
   5625         srpl_connection->remote_partner_id = event->content.session.partner_id;
   5626         srpl_connection->new_partner = event->content.session.new_partner;
   5627         return srpl_state_session_evaluate;
   5628     } else {
   5629         UNEXPECTED_EVENT(srpl_connection, event);
   5630     }
   5631 }
   5632 
   5633 // Send a session response
   5634 static srpl_state_t
   5635 srpl_session_response_send(srpl_connection_t *UNUSED srpl_connection, srpl_event_t *event)
   5636 {
   5637     REQUIRE_SRPL_EVENT_NULL(srpl_connection, event);
   5638     REQUIRE_SRPL_INSTANCE(srpl_connection);
   5639     STATE_ANNOUNCE_NO_EVENTS(srpl_connection);
   5640 
   5641     if (!srpl_session_message_send(srpl_connection, true)) {
   5642         return srpl_state_disconnect;
   5643     }
   5644     return srpl_state_send_candidates_message_wait;
   5645 }
   5646 
   5647 // We land here immediately after a server connection is received.
   5648 static srpl_state_t
   5649 srpl_send_candidates_message_wait_action(srpl_connection_t *srpl_connection, srpl_event_t *event)
   5650 {
   5651     REQUIRE_SRPL_INSTANCE(srpl_connection);
   5652     STATE_ANNOUNCE(srpl_connection, event);
   5653 
   5654     if (event == NULL) {
   5655         return srpl_state_invalid; // Wait for events.
   5656     } else if (event->event_type == srpl_event_send_candidates_message_received) {
   5657         return srpl_state_send_candidates_received;
   5658     } else {
   5659         UNEXPECTED_EVENT(srpl_connection, event);
   5660     }
   5661 }
   5662 
#ifdef SRP_TEST_SERVER
// Test-only state for an srpl_connection_t that does nothing on its own: every event delivered
// to it is announced and then handed to the test system, which supplies the next state.
static srpl_state_t
srpl_test_event_intercept_action(srpl_connection_t *srpl_connection, srpl_event_t *event)
{
    STATE_ANNOUNCE(srpl_connection, event);

    srpl_state_t next_state = test_packet_srpl_intercept(srpl_connection, event);
    return next_state;
}
#endif
   5674 
   5675 // Check to see if host is on the list of remaining candidates to send. If so, no need to do anything--it'll go out soon.
   5676 static bool
   5677 srpl_reschedule_candidate(srpl_connection_t *srpl_connection, adv_host_t *host)
   5678 {
   5679     // We don't need to queue new updates if we haven't yet generated a candidates list.
   5680     if (srpl_connection->candidates_not_generated) {
   5681         INFO("returning true because we haven't generated candidates.");
   5682         return true;
   5683     }
   5684     if (srpl_connection->candidates == NULL) {
   5685         INFO("returning false because we have no candidates.");
   5686         return false;
   5687     }
   5688     for (int i = srpl_connection->current_candidate + 1; i < srpl_connection->num_candidates; i++) {
   5689         if (srpl_connection->candidates[i] == host) {
   5690             INFO("returning true because the host is on the candidate list.");
   5691             return true;
   5692         }
   5693     }
   5694     INFO("returning false because the host is not on the candidate list.");
   5695     return false;
   5696 }
   5697 
   5698 static void
   5699 srpl_queue_srp_client_update(srpl_connection_t *srpl_connection, adv_host_t *host)
   5700 {
   5701     srpl_srp_client_queue_entry_t *new_entry, **qp;
   5702     // Find the end of the queue
   5703     for (qp = &srpl_connection->client_update_queue; *qp; qp = &(*qp)->next) {
   5704         srpl_srp_client_queue_entry_t *entry = *qp;
   5705         // No need to re-queue if we're already on the queue
   5706         if (!entry->sent && entry->host == host) {
   5707             INFO("host " PRI_S_SRP " is already on the update queue for connection " PRI_S_SRP,
   5708                  host->name, srpl_connection->name);
   5709             return;
   5710         }
   5711     }
   5712     new_entry = calloc(1, sizeof(*new_entry));
   5713     if (new_entry == NULL) {
   5714         ERROR(PRI_S_SRP ": no memory to queue SRP client update.", srpl_connection->name);
   5715         return;
   5716     }
   5717     INFO("adding host " PRI_S_SRP " to the update queue for connection " PRI_S_SRP, host->name, srpl_connection->name);
   5718     new_entry->host = host;
   5719     srp_adv_host_retain(new_entry->host);
   5720     *qp = new_entry;
   5721 }
   5722 
   5723 // Client update events are interesting in two cases. First, we might have received a host update for a
   5724 // host that was in contention when the update was received; in this case, we want to now apply the update,
   5725 // assuming that the contention is no longer present (it's possible that there are multiple sources of
   5726 // contention).
   5727 //
   5728 // The second case is where a client update succeeded; in this case we want to send that update to all of
   5729 // the remotes.
   5730 //
   5731 // We do not receive this event when an update that was triggered by an SRP Replication update; in that
   5732 // case we get an "apply finished" event instead of a "client update finished" event.
   5733 static void
   5734 srpl_srp_client_update_send_event_to_connection(srpl_connection_t *srpl_connection, srpl_event_t *event)
   5735 {
   5736     if (event->content.client_result.rcode == dns_rcode_noerror) {
   5737         adv_host_t *host = event->content.client_result.host;
   5738         if (!srpl_reschedule_candidate(srpl_connection, host)) {
   5739             srpl_queue_srp_client_update(srpl_connection, host);
   5740         }
   5741     }
   5742     srpl_event_deliver(srpl_connection, event);
   5743 }
   5744 
   5745 static void
   5746 srpl_deferred_srp_client_update_finished_event_deliver(void *context)
   5747 {
   5748     srpl_event_t *event = context;
   5749     srp_server_t *server_state = event->content.client_result.host->server_state;
   5750     if (server_state == NULL) {
   5751         FAULT("server state is NULL."); // this can't currently happen, because we just finished updating the host.
   5752         goto out;
   5753     }
   5754     for (srpl_domain_t *domain = server_state->srpl_domains; domain != NULL; domain = domain->next) {
   5755         for (srpl_instance_t *instance = domain->instances; instance != NULL; instance = instance->next) {
   5756             if (instance->connection != NULL) {
   5757                 srpl_srp_client_update_send_event_to_connection(instance->connection, event);
   5758             }
   5759         }
   5760     }
   5761     for (srpl_instance_t *instance = server_state->unmatched_instances; instance != NULL; instance = instance->next) {
   5762         if (instance->connection != NULL) {
   5763             srpl_srp_client_update_send_event_to_connection(instance->connection, event);
   5764         }
   5765     }
   5766 out:
   5767     srp_adv_host_release(event->content.client_result.host);
   5768     free(event);
   5769 }
   5770 
   5771 // When an SRP client update finished, we need to deliver an event to all connections indicating that this has
   5772 // occurred. This event must be delivered from the main run loop, to avoid starting an update before advertise_finish
   5773 // has completed its work.
   5774 void
   5775 srpl_srp_client_update_finished_event_send(adv_host_t *host, int rcode)
   5776 {
   5777     srpl_event_t *event;
   5778     event = malloc(sizeof(*event));
   5779     if (event == NULL) {
   5780         FAULT(PRI_S_SRP ": unable to allocate memory to defer event", host->name);
   5781         return;
   5782     }
   5783     srpl_event_initialize(event, srpl_event_srp_client_update_finished);
   5784     srpl_event_content_type_set(event, srpl_event_content_type_client_result);
   5785     event->content.client_result.host = host;
   5786     srp_adv_host_retain(event->content.client_result.host);
   5787     event->content.client_result.rcode = rcode;
   5788     ioloop_run_async(srpl_deferred_srp_client_update_finished_event_deliver, event);
   5789 }
   5790 
// One row of the connection state table: the state's enum value, its printable name, and the
// action function that is invoked for the state (NULL if the state has no action).
typedef struct {
    srpl_state_t state;
    char *name;
    srpl_action_t action;
} srpl_connection_state_t;

// Expands to the first two initializers of a table row: the srpl_state_<name> enum value and
// the bare name as a string.
#define STATE_NAME_DECL(name) srpl_state_##name, #name
// The connection state table. Rows must appear in the same order as the srpl_state_t values:
// srpl_state_get() checks that row i describes state i and then indexes the table directly by
// state.
static srpl_connection_state_t srpl_connection_states[] = {
    { STATE_NAME_DECL(invalid),                              NULL },
    { STATE_NAME_DECL(disconnected),                         srpl_disconnected_action },
    { STATE_NAME_DECL(next_address_get),                     srpl_next_address_get_action },
    { STATE_NAME_DECL(connect),                              srpl_connect_action },
    { STATE_NAME_DECL(idle),                                 srpl_idle_action },
    { STATE_NAME_DECL(reconnect_wait),                       srpl_reconnect_wait_action },
    { STATE_NAME_DECL(retry_delay_send),                     srpl_retry_delay_send_action },
    { STATE_NAME_DECL(disconnect),                           srpl_disconnect_action },
    { STATE_NAME_DECL(disconnect_wait),                      srpl_disconnect_wait_action },
// If a disconnected state is added here, please fix SRPL_CONNECTION_IS_CONNECTED above.
// connecting is counted as a connected state because we have a connection, even if it is not
// actually connected, and we'll get a disconnect if it fails, so we aren't stuck.
    { STATE_NAME_DECL(connecting),                           srpl_connecting_action },
    { STATE_NAME_DECL(session_send),                         srpl_session_send_action },
    { STATE_NAME_DECL(session_response_wait),                srpl_session_response_wait_action },
    { STATE_NAME_DECL(session_evaluate),                     srpl_session_evaluate_action },
    { STATE_NAME_DECL(sync_wait),                            srpl_sync_wait_action },
    // Here we are the endpoint that has sent the "send candidates" message and we are cycling through the candidates
    // we receive until we get a "send candidates" reply.

    { STATE_NAME_DECL(send_candidates_send),                 srpl_send_candidates_send_action },
    { STATE_NAME_DECL(send_candidates_wait),                 srpl_send_candidates_wait_action },

    // Got a "candidate" message, need to check it and send the right reply.
    { STATE_NAME_DECL(candidate_check),                      srpl_candidate_check_action },

    // At this point we've sent a candidate reply, so we're waiting for a host message. It's possible that the host
    // went away in the interim, in which case we will get a "candidate" message or a "send candidates" reply.

    { STATE_NAME_DECL(candidate_host_wait),                  srpl_candidate_host_wait_action },
    { STATE_NAME_DECL(candidate_host_prepare),               srpl_candidate_host_prepare_action },
    { STATE_NAME_DECL(candidate_host_contention_wait),       srpl_candidate_host_contention_wait_action },
    { STATE_NAME_DECL(candidate_host_re_evaluate),           srpl_candidate_host_re_evaluate_action },

    // Here we've gotten the host message (the SRP message), and need to apply it and send a response
    { STATE_NAME_DECL(candidate_host_apply),                 srpl_candidate_host_apply_action },
    { STATE_NAME_DECL(candidate_host_apply_wait),            srpl_candidate_host_apply_wait_action },

    // We've received a "send candidates" message. Make a list of candidates to send, and then start sending them.
    { STATE_NAME_DECL(send_candidates_received),             srpl_send_candidates_received_action },
    // See if there are any candidates left to send; if not, go to send_candidates_response_send
    { STATE_NAME_DECL(send_candidates_remaining_check),      srpl_candidates_remaining_check_action },
    // Send a "candidate" message for the next candidate
    { STATE_NAME_DECL(next_candidate_send),                  srpl_next_candidate_send_action },
    // Wait for a response to the "candidate" message
    { STATE_NAME_DECL(next_candidate_send_wait),             srpl_next_candidate_send_wait_action },
    // The candidate was requested, so send its host info
    { STATE_NAME_DECL(candidate_host_send),                  srpl_candidate_host_send_action },
    // We're waiting for the remote to acknowledge the host update
    { STATE_NAME_DECL(candidate_host_response_wait),         srpl_candidate_host_response_wait_action },

    // When we've run out of candidates to send, we send the candidates response.
    { STATE_NAME_DECL(send_candidates_response_send),        srpl_send_candidates_response_send_action },

    // This is the quiescent state for servers and clients after session establishment database sync.
    // Waiting for updates received locally, or updates sent by remote
    { STATE_NAME_DECL(ready),                                srpl_ready_action },
    // An update was received locally
    { STATE_NAME_DECL(srp_client_update_send),               srpl_srp_client_update_send_action },
    // We've gotten an ack
    { STATE_NAME_DECL(srp_client_ack_evaluate),              srpl_srp_client_ack_evaluate_action },
    // See if we have an update from the remote that we stashed because it arrived while we were sending one
    { STATE_NAME_DECL(stashed_host_check),                   srpl_stashed_host_check_action },
    // Apply a stashed update (which may have been stashed in the ready state or the client_update_ack_wait state)
    { STATE_NAME_DECL(stashed_host_apply),                   srpl_stashed_host_apply_action },
    // A stashed update finished; check the results
    { STATE_NAME_DECL(stashed_host_finished),                srpl_stashed_host_finished_action },

    // Initial startup state for server
    { STATE_NAME_DECL(session_message_wait),                 srpl_session_message_wait_action },
    // Send a response once we've figured out that we're going to continue
    { STATE_NAME_DECL(session_response_send),                srpl_session_response_send },
    // Wait for a "send candidates" message.
    { STATE_NAME_DECL(send_candidates_message_wait),         srpl_send_candidates_message_wait_action },

#ifdef SRP_TEST_SERVER
    // Test-only state whose action hands every event to the test system.
    { STATE_NAME_DECL(test_event_intercept),                 srpl_test_event_intercept_action },
#endif
};
// Number of rows in srpl_connection_states.
#define SRPL_NUM_CONNECTION_STATES (sizeof(srpl_connection_states) / sizeof(srpl_connection_state_t))
   5879 
   5880 static srpl_connection_state_t *
   5881 srpl_state_get(srpl_state_t state)
   5882 {
   5883     static bool once = false;
   5884     if (!once) {
   5885         for (unsigned i = 0; i < SRPL_NUM_CONNECTION_STATES; i++) {
   5886             if (srpl_connection_states[i].state != (srpl_state_t)i) {
   5887                 ERROR("srpl connection state %d doesn't match " PUB_S_SRP, i, srpl_connection_states[i].name);
   5888                 STATE_DEBUGGING_ABORT();
   5889                 return NULL;
   5890             }
   5891         }
   5892         once = true;
   5893     }
   5894     if (state < 0 || state >= SRPL_NUM_CONNECTION_STATES) {
   5895         STATE_DEBUGGING_ABORT();
   5896         return NULL;
   5897     }
   5898     return &srpl_connection_states[state];
   5899 }
   5900 
   5901 void
   5902 srpl_connection_next_state(srpl_connection_t *srpl_connection, srpl_state_t state)
   5903 {
   5904     srpl_state_t next_state = state;
   5905 
   5906     do {
   5907         srpl_connection_state_t *new_state = srpl_state_get(next_state);
   5908 
   5909         if (new_state == NULL) {
   5910             ERROR(PRI_S_SRP " next state is invalid: %d", srpl_connection->name, next_state);
   5911             STATE_DEBUGGING_ABORT();
   5912             return;
   5913         }
   5914         srpl_connection->state = next_state;
   5915         srpl_connection->state_name = new_state->name;
   5916         srpl_connection->state_start_time = srp_time();
   5917         srpl_action_t action = new_state->action;
   5918         if (action != NULL) {
   5919             next_state = action(srpl_connection, NULL);
   5920         }
   5921     } while (next_state != srpl_state_invalid);
   5922 }
   5923 
   5924 //
   5925 // Event functions
   5926 //
   5927 
// One row of the event table: the event's enum value and its printable name.
typedef struct {
    srpl_event_type_t event_type;
    char *name;
} srpl_event_configuration_t;

// Expands to a complete table row: the srpl_event_<name> enum value and the bare name as a string.
#define EVENT_NAME_DECL(name) { srpl_event_##name, #name }

// The event table. Rows must appear in the same order as the srpl_event_type_t values:
// srpl_event_configuration_get() checks that row i describes event i and then indexes the
// table directly by event type.
// NOTE(review): this table is not declared static; confirm whether anything outside this file
// refers to it.
srpl_event_configuration_t srpl_event_configurations[] = {
    EVENT_NAME_DECL(invalid),
    EVENT_NAME_DECL(address_add),
    EVENT_NAME_DECL(address_remove),
    EVENT_NAME_DECL(server_disconnect),
    EVENT_NAME_DECL(reconnect_timer_expiry),
    EVENT_NAME_DECL(disconnected),
    EVENT_NAME_DECL(connected),
    EVENT_NAME_DECL(session_response_received),
    EVENT_NAME_DECL(send_candidates_response_received),
    EVENT_NAME_DECL(candidate_received),
    EVENT_NAME_DECL(host_message_received),
    EVENT_NAME_DECL(srp_client_update_finished),
    EVENT_NAME_DECL(advertise_finished),
    EVENT_NAME_DECL(candidate_response_received),
    EVENT_NAME_DECL(host_response_received),
    EVENT_NAME_DECL(session_message_received),
    EVENT_NAME_DECL(send_candidates_message_received),
    EVENT_NAME_DECL(do_sync),
};
// Number of rows in srpl_event_configurations.
#define SRPL_NUM_EVENT_TYPES (sizeof(srpl_event_configurations) / sizeof(srpl_event_configuration_t))
   5956 
   5957 static srpl_event_configuration_t *
   5958 srpl_event_configuration_get(srpl_event_type_t event)
   5959 {
   5960     static bool once = false;
   5961     if (!once) {
   5962         for (unsigned i = 0; i < SRPL_NUM_EVENT_TYPES; i++) {
   5963             if (srpl_event_configurations[i].event_type != (srpl_event_type_t)i) {
   5964                 ERROR("srpl connection event %d doesn't match " PUB_S_SRP, i, srpl_event_configurations[i].name);
   5965                 STATE_DEBUGGING_ABORT();
   5966                 return NULL;
   5967             }
   5968         }
   5969         once = true;
   5970     }
   5971     if (event < 0 || event >= SRPL_NUM_EVENT_TYPES) {
   5972         STATE_DEBUGGING_ABORT();
   5973         return NULL;
   5974     }
   5975     return &srpl_event_configurations[event];
   5976 }
   5977 
   5978 static const char *
   5979 srpl_state_name(srpl_state_t state)
   5980 {
   5981     for (unsigned i = 0; i < SRPL_NUM_CONNECTION_STATES; i++) {
   5982         if (srpl_connection_states[i].state == state) {
   5983             return srpl_connection_states[i].name;
   5984         }
   5985     }
   5986     return "unknown state";
   5987 }
   5988 
   5989 void
   5990 srpl_dump_connection_states(srp_server_t *server_state)
   5991 {
   5992     srpl_domain_t *domain;
   5993     srpl_instance_t *instance;
   5994     srpl_connection_t *connection;
   5995     uint32_t days, hours, minutes, seconds;
   5996 
   5997     for (domain = server_state->srpl_domains; domain != NULL; domain = domain->next) {
   5998         INFO("srpl connections in domain " PRI_S_SRP ":", domain->name);
   5999         for (instance = domain->instances; instance != NULL; instance = instance->next) {
   6000             connection = instance->connection;
   6001             if (connection != NULL) {
   6002                 seconds = (uint32_t)(srp_time() - connection->state_start_time);
   6003                 days = seconds / SECONDS_IN_DAY;
   6004                 seconds -= days * SECONDS_IN_DAY;
   6005                 hours = seconds / SECONDS_IN_HOUR;
   6006                 seconds -= hours * SECONDS_IN_HOUR;
   6007                 minutes = seconds / SECONDS_IN_MINUTE;
   6008                 seconds -= minutes * SECONDS_IN_MINUTE;
   6009                 INFO(PUB_S_SRP "connected to srpl instance " PRI_S_SRP " - connection " PRI_S_SRP
   6010                      ", in state " PUB_S_SRP " for %" PRIu32 " days %" PRIu32 " hours %" PRIu32 " minutes %" PRIu32 " seconds",
   6011                      connection->state > srpl_state_connecting ? "" : "not yet ",
   6012                      instance->instance_name, connection->name, srpl_state_name(connection->state),
   6013                      days, hours, minutes, seconds);
   6014             } else {
   6015                 INFO("no connection to srpl instance " PRI_S_SRP PUB_S_SRP, instance->instance_name,
   6016                      instance->is_me ? ", is_me" : "");
   6017             }
   6018         }
   6019     }
   6020 
   6021     for (instance = server_state->unmatched_instances; instance != NULL; instance = instance->next) {
   6022         connection = instance->connection;
   6023         if (connection != NULL) {
   6024             seconds = (uint32_t)(srp_time() - connection->state_start_time);
   6025             days = seconds / SECONDS_IN_DAY;
   6026             seconds -= days * SECONDS_IN_DAY;
   6027             hours = seconds / SECONDS_IN_HOUR;
   6028             seconds -= hours * SECONDS_IN_HOUR;
   6029             minutes = seconds / SECONDS_IN_MINUTE;
   6030             seconds -= minutes * SECONDS_IN_MINUTE;
   6031             INFO(PUB_S_SRP "connected to unidentified instance " PRI_S_SRP " - connection " PRI_S_SRP
   6032                  ", in state " PUB_S_SRP " for %" PRIu32 " days %" PRIu32 " hours %" PRIu32 " minutes %" PRIu32 " seconds",
   6033                  connection->state > srpl_state_connecting ? "" : "not yet ",
   6034                  instance->instance_name, connection->name, srpl_state_name(connection->state),
   6035                  days, hours, minutes, seconds);
   6036         }
   6037     }
   6038 }
   6039 
   6040 void
   6041 srpl_change_server_priority(srp_server_t *server_state, uint32_t new)
   6042 {
   6043     if (server_state != NULL && new != server_state->priority) {
   6044         server_state->priority = new;
   6045         for (srpl_domain_t *domain = server_state->srpl_domains; domain != NULL; domain = domain->next) {
   6046             // priority changed. re-advertise.
   6047             srpl_domain_advertise(domain);
   6048         }
   6049     }
   6050 }
   6051 
   6052 static void
   6053 srpl_event_initialize(srpl_event_t *event, srpl_event_type_t event_type)
   6054 {
   6055     memset(event, 0, sizeof(*event));
   6056     srpl_event_configuration_t *event_config = srpl_event_configuration_get(event_type);
   6057     if (event_config == NULL) {
   6058         ERROR("invalid event type %d", event_type);
   6059         STATE_DEBUGGING_ABORT();
   6060         return;
   6061     }
   6062     event->event_type = event_type;
   6063     event->name = event_config->name;
   6064 }
   6065 
   6066 static void
   6067 srpl_event_deliver(srpl_connection_t *srpl_connection, srpl_event_t *event)
   6068 {
   6069     srpl_connection_state_t *state = srpl_state_get(srpl_connection->state);
   6070     if (state == NULL) {
   6071         ERROR(PRI_S_SRP ": event " PUB_S_SRP " received in invalid state %d",
   6072               srpl_connection->name, event->name, srpl_connection->state);
   6073         STATE_DEBUGGING_ABORT();
   6074         return;
   6075     }
   6076     if (state->action == NULL) {
   6077         FAULT(PRI_S_SRP": event " PUB_S_SRP " received in state " PUB_S_SRP " with NULL action",
   6078               srpl_connection->name, event->name, state->name);
   6079         return;
   6080     }
   6081     srpl_state_t next_state = state->action(srpl_connection, event);
   6082     if (next_state != srpl_state_invalid) {
   6083         srpl_connection_next_state(srpl_connection, next_state);
   6084     }
   6085 }
   6086 
   6087 static void
   6088 srpl_re_register(void *context)
   6089 {
   6090     INFO("re-registering SRPL service");
   6091     srpl_domain_advertise(context);
   6092 }
   6093 
   6094 static void
   6095 srpl_register_completion(DNSServiceRef UNUSED sdref, DNSServiceFlags UNUSED flags, DNSServiceErrorType error_code,
   6096                          const char *name, const char *regtype, const char *domain, void *context)
   6097 {
   6098     srpl_domain_t *srpl_domain = context;
   6099 
   6100     if (error_code != kDNSServiceErr_NoError) {
   6101         ERROR("unable to advertise _srpl-tls._tcp service: %d", error_code);
   6102         if (srpl_domain->srpl_register_wakeup == NULL) {
   6103             srpl_domain->srpl_register_wakeup = ioloop_wakeup_create();
   6104         }
   6105         if (srpl_domain->srpl_register_wakeup != NULL) {
   6106             // Try registering again in one second.
   6107             ioloop_add_wake_event(srpl_domain->srpl_register_wakeup, srpl_domain, srpl_re_register, srpl_domain_context_release, 1000);
   6108             RETAIN_HERE(srpl_domain, srpl_domain);
   6109         }
   6110         return;
   6111     }
   6112     INFO("registered SRP Replication instance name " PRI_S_SRP "." PUB_S_SRP "." PRI_S_SRP, name, regtype, domain);
   6113 }
   6114 
   6115 static void
   6116 srpl_domain_advertise(srpl_domain_t *domain)
   6117 {
   6118     DNSServiceRef sdref = NULL;
   6119     TXTRecordRef txt_record;
   6120     char partner_id_buf[INT64_HEX_STRING_MAX];
   6121     char dataset_id_buf[INT64_HEX_STRING_MAX];
   6122     char xpanid_buf[INT64_HEX_STRING_MAX];
   6123     char priority_buf[INT64_HEX_STRING_MAX];
   6124     srp_server_t *server_state = domain->server_state;
   6125 
   6126     if (domain->srpl_opstate != SRPL_OPSTATE_ROUTINE) {
   6127         INFO(PUB_S_SRP ": not in routine state", domain->name);
   6128         return;
   6129     }
   6130 
   6131     TXTRecordCreate(&txt_record, 0, NULL);
   6132 
   6133     int err = TXTRecordSetValue(&txt_record, "dn", strlen(server_state->current_thread_domain_name), domain->name);
   6134     if (err != kDNSServiceErr_NoError) {
   6135         ERROR("unable to set domain in TXT record for _srpl-tls._tcp to " PRI_S_SRP, domain->name);
   6136         goto exit;
   6137     }
   6138 
   6139     snprintf(partner_id_buf, sizeof(partner_id_buf), "%" PRIx64, domain->partner_id);
   6140     err = TXTRecordSetValue(&txt_record, "pid", strlen(partner_id_buf), partner_id_buf);
   6141     if (err != kDNSServiceErr_NoError) {
   6142         ERROR("unable to set partner-id in TXT record for _srpl-tls._tcp to " PUB_S_SRP, partner_id_buf);
   6143         goto exit;
   6144     }
   6145 
   6146     snprintf(dataset_id_buf, sizeof(dataset_id_buf), "%" PRIx64, domain->dataset_id);
   6147     err = TXTRecordSetValue(&txt_record, "did", strlen(dataset_id_buf), dataset_id_buf);
   6148     if (err != kDNSServiceErr_NoError) {
   6149         ERROR("unable to set dataset-id in TXT record for _srpl-tls._tcp to " PUB_S_SRP, dataset_id_buf);
   6150         goto exit;
   6151     }
   6152 
   6153     snprintf(xpanid_buf, sizeof(xpanid_buf), "%" PRIx64, domain->server_state->xpanid);
   6154     err = TXTRecordSetValue(&txt_record, "xpanid", strlen(xpanid_buf), xpanid_buf);
   6155     if (err != kDNSServiceErr_NoError) {
   6156         ERROR("unable to set xpanid in TXT record for _srpl-tls._tcp to " PUB_S_SRP, dataset_id_buf);
   6157         goto exit;
   6158     }
   6159 
   6160     snprintf(priority_buf, sizeof(priority_buf), "%" PRIx32, domain->server_state->priority);
   6161     err = TXTRecordSetValue(&txt_record, "priority", strlen(priority_buf), priority_buf);
   6162     if (err != kDNSServiceErr_NoError) {
   6163         ERROR("unable to set priority in TXT record for _srpl-tls._tcp to " PUB_S_SRP, priority_buf);
   6164         goto exit;
   6165     }
   6166 
   6167     // If there is already a registration, get rid of it
   6168     if (domain->srpl_advertise_txn != NULL) {
   6169         ioloop_dnssd_txn_cancel(domain->srpl_advertise_txn);
   6170         ioloop_dnssd_txn_release(domain->srpl_advertise_txn);
   6171         domain->srpl_advertise_txn = NULL;
   6172     }
   6173 
   6174     err = DNSServiceRegister(&sdref, kDNSServiceFlagsUnique,
   6175                              kDNSServiceInterfaceIndexAny, NULL, "_srpl-tls._tcp", NULL,
   6176                              NULL, htons(853), TXTRecordGetLength(&txt_record), TXTRecordGetBytesPtr(&txt_record),
   6177                              srpl_register_completion, domain);
   6178     if (err != kDNSServiceErr_NoError) {
   6179         ERROR("unable to advertise _srpl-tls._tcp service");
   6180         goto exit;
   6181     }
   6182     domain->srpl_advertise_txn = ioloop_dnssd_txn_add(sdref, NULL, NULL, NULL);
   6183     if (domain->srpl_advertise_txn == NULL) {
   6184         ERROR("unable to set up a dnssd_txn_t for _srpl-tls._tcp advertisement.");
   6185         goto exit;
   6186     }
   6187     sdref = NULL; // srpl_advertise_txn holds the reference.
   6188     INFO(PUB_S_SRP ": successfully advertised", domain->name);
   6189 
   6190 exit:
   6191     if (sdref != NULL) {
   6192         DNSServiceRefDeallocate(sdref);
   6193     }
   6194     TXTRecordDeallocate(&txt_record);
   6195     return;
   6196 }
   6197 
   6198 static void
   6199 srpl_thread_network_name_callback(void *NULLABLE context, const char *NULLABLE thread_network_name, cti_status_t status)
   6200 {
   6201     size_t thread_domain_size;
   6202     char domain_buf[kDNSServiceMaxDomainName];
   6203     char *new_thread_domain_name;
   6204     srp_server_t *server_state = context;
   6205 
   6206     if (thread_network_name == NULL || status != kCTIStatus_NoError) {
   6207         ERROR("unable to get thread network name.");
   6208         return;
   6209     }
   6210     thread_domain_size = snprintf(domain_buf, sizeof(domain_buf),
   6211                                   "%s.%s", thread_network_name, SRP_THREAD_DOMAIN);
   6212     if (thread_domain_size < 0 || thread_domain_size >= sizeof (domain_buf) ||
   6213         (new_thread_domain_name = strdup(domain_buf)) == NULL)
   6214     {
   6215         ERROR("no memory for new thread network name: " PRI_S_SRP, thread_network_name);
   6216         return;
   6217     }
   6218 
   6219     if (server_state->current_thread_domain_name != NULL) {
   6220         srpl_domain_rename(server_state->current_thread_domain_name, new_thread_domain_name);
   6221     }
   6222     srpl_domain_add(server_state, new_thread_domain_name);
   6223     free(server_state->current_thread_domain_name);
   6224     server_state->current_thread_domain_name = new_thread_domain_name;
   6225 }
   6226 
   6227 // If the partner does not discover any other SRPL partners to synchronize with,
   6228 // or it has synchronized with all the partners discovered so far, it transitions
   6229 // out of the "startup" state to the "routine operation" state.
   6230 static void
   6231 srpl_partner_discovery_timeout(void *context)
   6232 {
   6233     srpl_domain_t *domain = context;
   6234 
   6235     INFO("partner discovery timeout.");
   6236 
   6237     domain->partner_discovery_pending = false;
   6238     if (domain->partner_discovery_timeout != NULL) {
   6239         ioloop_cancel_wake_event(domain->partner_discovery_timeout);
   6240         ioloop_wakeup_release(domain->partner_discovery_timeout);
   6241         domain->partner_discovery_timeout = NULL;
   6242     }
   6243     srpl_maybe_sync_or_transition(domain);
   6244 }
   6245 
   6246 // count how many partners are advertising the proposed dataset id
   6247 static int
   6248 srpl_active_winning_partners(srpl_domain_t *domain, int *rank)
   6249 {
   6250     int num_winners = 0;
   6251     int my_rank = 0;
   6252     if (domain->have_dataset_id) {
   6253         uint64_t proposed_dataset_id = domain->dataset_id;
   6254         uint32_t my_priority = domain->server_state->priority;
   6255         // winning partners are those with higher priority, or same priority but
   6256         // lower partner id, and advertising the proposed dataset id.
   6257         for (srpl_instance_t *instance = domain->instances; instance != NULL; instance = instance->next) {
   6258             if (instance->connection != NULL &&
   6259                 instance->connection->state > srpl_state_session_evaluate &&
   6260                 instance->dataset_id == proposed_dataset_id &&
   6261                 instance->priority >= my_priority)
   6262             {
   6263                 num_winners++;
   6264                 if (instance->priority > my_priority || (instance->priority == my_priority &&
   6265                     instance->partner_id < domain->partner_id))
   6266                 {
   6267                     my_rank++;
   6268                 }
   6269             }
   6270         }
   6271     }
   6272     *rank = my_rank;
   6273     return num_winners;
   6274 }
   6275 
   6276 static void
   6277 srpl_sync_with_instance(srpl_instance_t *instance)
   6278 {
   6279     INFO("sync with " PRI_S_SRP " with dataset_id %" PRIx64, instance->instance_name, instance->dataset_id);
   6280     if (instance->discovered_in_window) {
   6281         instance->sync_to_join = true;
   6282     }
   6283     instance->sync_fail = false;
   6284     srpl_event_t event;
   6285     srpl_event_initialize(&event, srpl_event_do_sync);
   6286     srpl_event_deliver(instance->connection, &event);
   6287 }
   6288 
   6289 // We check how many active srp servers we discovered. If less than 5, we first
   6290 // check if we are ready to enter the routine state. If there are still srp servers
   6291 // that we haven't started wo sync with, we do so.
   6292 static void
   6293 srpl_maybe_sync_or_transition(srpl_domain_t *domain)
   6294 {
   6295     int num_winners = 0;
   6296     int rank = 0;
   6297     // if we haven't committed a dataset id, here we check if we
   6298     // should propose a new one. we propose a new dataset id if
   6299     // sync with the instances of the proposed dataset id all fail, or
   6300     // there's no partner advertising the proposed dataset id.
   6301     if (!domain->dataset_id_committed) {
   6302         srpl_maybe_propose_new_dataset_id(domain);
   6303     }
   6304     num_winners = srpl_active_winning_partners(domain, &rank);
   6305 
   6306     if (num_winners < MAX_ANYCAST_NUM) {
   6307         INFO("%d other srp servers are advertising.", num_winners);
   6308         for (srpl_instance_t *instance = domain->instances; instance != NULL; instance = instance->next) {
   6309             // we sync with the instances with the same dataset id
   6310             if (instance->connection != NULL &&
   6311                 instance->connection->state == srpl_state_sync_wait &&
   6312                 srpl_dataset_id_compare(instance->dataset_id, domain->dataset_id) >= 0)
   6313             {
   6314                 srpl_sync_with_instance(instance);
   6315             }
   6316         }
   6317         if (domain->srpl_opstate != SRPL_OPSTATE_ROUTINE &&
   6318             srpl_can_transition_to_routine_state(domain))
   6319         {
   6320             srpl_transition_to_routine_state(domain);
   6321         }
   6322     } else {
   6323         // MAX_ANYCAST_NUM or more better qualified servers are advertising. if
   6324         // we are in the routine state and advertising, we should withdraw and
   6325         // reenter the startup state.
   6326         INFO("%d other srp servers are advertising, rank = %d.", num_winners, rank);
   6327         if (domain->srpl_opstate == SRPL_OPSTATE_ROUTINE &&
   6328             rank >= MAX_ANYCAST_NUM)
   6329         {
   6330             INFO("transition to startup state.");
   6331             srpl_transition_to_startup_state(domain);
   6332         }
   6333     }
   6334 }
   6335 
   6336 static void
   6337 srpl_transition_to_startup_state(srpl_domain_t *domain)
   6338 {
   6339     srp_server_t *server_state = domain->server_state;
   6340 
   6341     // stop advertising the domain.
   6342     srpl_stop_domain_advertisement(domain);
   6343     // move to "startup" state.
   6344     domain->srpl_opstate = SRPL_OPSTATE_STARTUP;
   6345 
   6346     if (server_state == NULL) {
   6347         ERROR("server state is NULL.");
   6348         return;
   6349     }
   6350 
   6351 #if STUB_ROUTER
   6352     route_state_t *route_state = NULL;
   6353     route_state = server_state->route_state;
   6354     if (route_state != NULL && !strcmp(domain->name, server_state->current_thread_domain_name)) {
   6355         route_state->partition_can_advertise_anycast_service = false;
   6356         partition_stop_advertising_anycast_service(route_state, route_state->thread_sequence_number);
   6357     }
   6358 #endif
   6359     if (domain->partner_discovery_timeout == NULL) {
   6360         domain->partner_discovery_timeout = ioloop_wakeup_create();
   6361     }
   6362     if (domain->partner_discovery_timeout) {
   6363         ioloop_add_wake_event(domain->partner_discovery_timeout, domain,
   6364                               srpl_partner_discovery_timeout, srpl_domain_context_release,
   6365                               MIN_PARTNER_DISCOVERY_INTERVAL +
   6366                               srp_random16() % PARTNER_DISCOVERY_INTERVAL_RANGE);
   6367         RETAIN_HERE(domain, srpl_domain);
   6368     } else {
   6369         ERROR("unable to add wakeup event for partner discovery.");
   6370         return;
   6371     }
   6372     domain->partner_discovery_pending = true;
   6373 }
   6374 
   6375 void
   6376 srpl_startup(srp_server_t *server_state)
   6377 {
   6378     cti_get_thread_network_name(server_state, srpl_thread_network_name_callback, NULL);
   6379 }
   6380 #endif // SRP_FEATURE_REPLICATION
   6381 
   6382 // Local Variables:
   6383 // mode: C
   6384 // tab-width: 4
   6385 // c-file-style: "bsd"
   6386 // c-basic-offset: 4
   6387 // fill-column: 120
   6388 // indent-tabs-mode: nil
   6389 // End:
   6390