Fix logic issues during graceful shutdown (#171)
* Fix logic issues during graceful shutdown

# Motivation

This should fix the remaining issues raised in #166. The problem was that if a service finished or threw out of order, we wrongly treated it as if the service we were currently shutting down had finished.
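
A toy illustration of the failure mode (hypothetical names and structure, not the library's actual code): if the loop that drives the shutdown assumes any child-task result belongs to the service it is currently shutting down, an out-of-order finish advances the shutdown prematurely.

```swift
// Toy model of the bug, with hypothetical names: the group intends to shut
// services down in reverse start order, but the loop advances whenever *any*
// service finishes, not just the one it is currently shutting down.
struct BuggyShutdown {
    var shutdownOrder = ["C", "B", "A"]  // intended shutdown order
    var currentIndex = 0

    mutating func serviceFinished(_ name: String) {
        // Buggy bookkeeping: assume the finished service is the current one.
        print("\(shutdownOrder[currentIndex]) assumed finished (actually \(name))")
        currentIndex += 1  // advances even when name != shutdownOrder[currentIndex]
    }
}

var shutdown = BuggyShutdown()
shutdown.serviceFinished("A")  // "A" finishes out of order while "C" is shutting down
// Prints "C assumed finished (actually A)"; the group then moves on to "B"
// even though "C" is still running.
```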

# Modification
This PR ensures that we use the same `services` array during the graceful shutdown to nil out services that have finished. This way we correctly keep track of every service that has finished. Additionally, there was a separate bug where we started to shut down the next service too early if another service threw and had the termination behaviour of `shutdownGracefully`.
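
A minimal sketch of the corrected bookkeeping, using hypothetical names rather than the actual `ServiceGroup` internals: the shared optional-element array is passed `inout` so finished services can be nilled out in place, and the index of the finished or throwing service is compared against the index currently being shut down before advancing.

```swift
// Minimal sketch of the corrected bookkeeping, with hypothetical names.
func handleResult(
    finishedIndex: Int,
    gracefulShutdownIndex: Int,
    services: inout [String?]  // shared array: nil marks a finished service
) -> Bool {
    // Record the finish in the same array the shutdown loop iterates,
    // so a later iteration skips this service.
    services[finishedIndex] = nil

    if finishedIndex == gracefulShutdownIndex {
        // The service we are currently shutting down is done:
        // it is safe to move on to the next one.
        return true
    } else {
        // Some other service finished or threw out of order:
        // keep waiting for the current service before advancing.
        return false
    }
}

// Usage: while shutting down index 0 ("C"), index 2 ("A") throws first.
var services: [String?] = ["C", "B", "A"]
let advance = handleResult(finishedIndex: 2, gracefulShutdownIndex: 0, services: &services)
print(advance, services)  // prints: false [Optional("C"), Optional("B"), nil]
```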

# Result
No more incorrect shutdown orderings.

* Fix one more race condition
FranzBusch authored Jan 26, 2024
1 parent b21c43a commit cdd6040
Showing 2 changed files with 336 additions and 29 deletions.
85 changes: 57 additions & 28 deletions Sources/ServiceLifecycle/ServiceGroup.swift
@@ -327,7 +327,7 @@ public actor ServiceGroup: Sendable {
services[index] = nil
do {
try await self.shutdownGracefully(
services: services,
services: &services,
cancellationTimeoutTask: &cancellationTimeoutTask,
group: &group,
gracefulShutdownManagers: gracefulShutdownManagers
@@ -380,7 +380,7 @@ public actor ServiceGroup: Sendable {

do {
try await self.shutdownGracefully(
services: services,
services: &services,
cancellationTimeoutTask: &cancellationTimeoutTask,
group: &group,
gracefulShutdownManagers: gracefulShutdownManagers
@@ -421,7 +421,7 @@ public actor ServiceGroup: Sendable {
)
do {
try await self.shutdownGracefully(
services: services,
services: &services,
cancellationTimeoutTask: &cancellationTimeoutTask,
group: &group,
gracefulShutdownManagers: gracefulShutdownManagers
@@ -448,7 +448,7 @@ public actor ServiceGroup: Sendable {

do {
try await self.shutdownGracefully(
services: services,
services: &services,
cancellationTimeoutTask: &cancellationTimeoutTask,
group: &group,
gracefulShutdownManagers: gracefulShutdownManagers
@@ -489,7 +489,7 @@ public actor ServiceGroup: Sendable {
}

private func shutdownGracefully(
services: [ServiceGroupConfiguration.ServiceConfiguration?],
services: inout [ServiceGroupConfiguration.ServiceConfiguration?],
cancellationTimeoutTask: inout Task<Void, Never>?,
group: inout ThrowingTaskGroup<ChildTaskResult, Error>,
gracefulShutdownManagers: [GracefulShutdownManager]
@@ -519,7 +519,7 @@ public actor ServiceGroup: Sendable {
self.logger.debug(
"Service already finished. Skipping shutdown"
)
continue
continue gracefulShutdownLoop
}
self.logger.debug(
"Triggering graceful shutdown for service",
@@ -533,6 +533,7 @@
while let result = try await group.next() {
switch result {
case .serviceFinished(let service, let index):
services[index] = nil
if group.isCancelled {
// The group is cancelled and we expect all services to finish
continue gracefulShutdownLoop
@@ -561,7 +562,8 @@
throw ServiceGroupError.serviceFinishedUnexpectedly()
}

case .serviceThrew(let service, _, let serviceError):
case .serviceThrew(let service, let index, let serviceError):
services[index] = nil
switch service.failureTerminationBehavior.behavior {
case .cancelGroup:
self.logger.debug(
@@ -575,32 +577,58 @@
throw serviceError

case .gracefullyShutdownGroup:
self.logger.debug(
"Service threw error during graceful shutdown.",
metadata: [
self.loggingConfiguration.keys.serviceKey: "\(service.service)",
self.loggingConfiguration.keys.errorKey: "\(serviceError)",
]
)

if error == nil {
error = serviceError
}

// We can continue shutting down the next service now
continue gracefulShutdownLoop
if index == gracefulShutdownIndex {
// The service that we were shutting down right now threw. Since its failure
// behaviour is to shut down the group we can continue
self.logger.debug(
"The service that we were shutting down threw. Continuing with the next one.",
metadata: [
self.loggingConfiguration.keys.serviceKey: "\(service.service)",
self.loggingConfiguration.keys.errorKey: "\(serviceError)",
]
)
continue gracefulShutdownLoop
} else {
// Another service threw while we were waiting for a shutdown.
// We have to continue iterating the task group's results
self.logger.debug(
"Another service than the service that we were shutting down threw. Continuing with the next one.",
metadata: [
self.loggingConfiguration.keys.serviceKey: "\(service.service)",
self.loggingConfiguration.keys.errorKey: "\(serviceError)",
]
)
break
}

case .ignore:
self.logger.debug(
"Service threw error during graceful shutdown.",
metadata: [
self.loggingConfiguration.keys.serviceKey: "\(service.service)",
self.loggingConfiguration.keys.errorKey: "\(serviceError)",
]
)

// We can continue shutting down the next service now
continue gracefulShutdownLoop
if index == gracefulShutdownIndex {
// The service that we were shutting down right now threw. Since its failure
// behaviour is to ignore the error we can continue
self.logger.debug(
"The service that we were shutting down threw. Continuing with the next one.",
metadata: [
self.loggingConfiguration.keys.serviceKey: "\(service.service)",
self.loggingConfiguration.keys.errorKey: "\(serviceError)",
]
)
continue gracefulShutdownLoop
} else {
// Another service threw while we were waiting for a shutdown.
// We have to continue iterating the task group's results
self.logger.debug(
"Another service than the service that we were shutting down threw. Continuing with the next one.",
metadata: [
self.loggingConfiguration.keys.serviceKey: "\(service.service)",
self.loggingConfiguration.keys.errorKey: "\(serviceError)",
]
)
break
}
}

case .signalCaught(let signal):
Expand Down Expand Up @@ -635,7 +663,8 @@ public actor ServiceGroup: Sendable {

case .signalSequenceFinished, .gracefulShutdownCaught, .gracefulShutdownFinished:
// We just have to tolerate this since signals and parent graceful shutdowns can race.
// We are going to continue the
// We are going to continue the result loop since we have to wait for our service
// to finish.
break
}
}