Skip to content

Commit

Permalink
refactor(iroh): move code from builder to node and make things nicer (#…
Browse files Browse the repository at this point in the history
…2386)

## Description

This is a cleanup of the node and builder code.

* Move `run` and `gc_loop` from `builder.rs` to `node.rs` - it is not
about building but about running.
* Improve the code flow and naming all around the builder, spawn and run
functions
* Make sure we shutdown the node correctly while spawning

## Breaking Changes

* Removed `Node::controller`. Use `Node::client` instead. The former was
mostly unusable anyway because we made the RPC structs private.

## Notes & open questions

<!-- Any notes, remarks or open questions you have to make about the PR.
-->

## Change checklist

- [x] Self-review.
- [x] Documentation updates if relevant.
- [ ] ~~Tests if relevant.~~
- [x] All breaking changes documented.
  • Loading branch information
Frando authored Jun 19, 2024
1 parent 13ded84 commit 08f1fe0
Show file tree
Hide file tree
Showing 3 changed files with 612 additions and 613 deletions.
288 changes: 259 additions & 29 deletions iroh/src/node.rs
Original file line number Diff line number Diff line change
Expand Up @@ -3,26 +3,26 @@
//! A node is a server that serves various protocols.
//!
//! To shut down the node, call [`Node::shutdown`].
use std::fmt::Debug;
use std::net::SocketAddr;
use std::path::Path;
use std::sync::Arc;
use std::{collections::BTreeSet, net::SocketAddr};
use std::{fmt::Debug, time::Duration};

use anyhow::{anyhow, Result};
use futures_lite::StreamExt;
use iroh_base::key::PublicKey;
use iroh_blobs::downloader::Downloader;
use iroh_blobs::store::Store as BaoStore;
use iroh_blobs::store::{GcMarkEvent, GcSweepEvent, Store as BaoStore};
use iroh_blobs::{downloader::Downloader, protocol::Closed};
use iroh_docs::engine::Engine;
use iroh_gossip::net::Gossip;
use iroh_net::key::SecretKey;
use iroh_net::Endpoint;
use iroh_net::{endpoint::DirectAddrsStream, util::SharedAbortingJoinHandle};
use quic_rpc::transport::flume::FlumeConnection;
use quic_rpc::RpcClient;
use quic_rpc::{RpcServer, ServiceEndpoint};
use tokio::task::JoinSet;
use tokio_util::sync::CancellationToken;
use tokio_util::task::LocalPoolHandle;
use tracing::debug;
use tracing::{debug, error, info, warn};

use crate::{client::RpcService, node::protocol::ProtocolMap};

Expand All @@ -48,20 +48,19 @@ pub use protocol::ProtocolHandler;
#[derive(Debug, Clone)]
pub struct Node<D> {
inner: Arc<NodeInner<D>>,
client: crate::client::MemIroh,
task: SharedAbortingJoinHandle<()>,
protocols: Arc<ProtocolMap>,
}

#[derive(derive_more::Debug)]
struct NodeInner<D> {
db: D,
sync: DocsEngine,
docs: DocsEngine,
endpoint: Endpoint,
gossip: Gossip,
secret_key: SecretKey,
cancel_token: CancellationToken,
controller: FlumeConnection<RpcService>,
client: crate::client::MemIroh,
#[debug("rt")]
rt: LocalPoolHandle,
downloader: Downloader,
Expand Down Expand Up @@ -133,14 +132,9 @@ impl<D: BaoStore> Node<D> {
self.inner.secret_key.public()
}

/// Returns a handle that can be used to do RPC calls to the node internally.
pub fn controller(&self) -> crate::client::MemRpcClient {
RpcClient::new(self.inner.controller.clone())
}

/// Return a client to control this node over an in-memory channel.
pub fn client(&self) -> &crate::client::MemIroh {
&self.client
&self.inner.client
}

/// Returns a referenc to the used `LocalPoolHandle`.
Expand Down Expand Up @@ -189,11 +183,11 @@ impl<D> std::ops::Deref for Node<D> {
type Target = crate::client::MemIroh;

fn deref(&self) -> &Self::Target {
&self.client
&self.inner.client
}
}

impl<D> NodeInner<D> {
impl<D: iroh_blobs::store::Store> NodeInner<D> {
async fn local_endpoint_addresses(&self) -> Result<Vec<SocketAddr>> {
let endpoints = self
.endpoint
Expand All @@ -203,6 +197,243 @@ impl<D> NodeInner<D> {
.ok_or(anyhow!("no endpoints found"))?;
Ok(endpoints.into_iter().map(|x| x.addr).collect())
}

async fn run(
self: Arc<Self>,
external_rpc: impl ServiceEndpoint<RpcService>,
internal_rpc: impl ServiceEndpoint<RpcService>,
protocols: Arc<ProtocolMap>,
gc_policy: GcPolicy,
gc_done_callback: Option<Box<dyn Fn() + Send>>,
) {
let (ipv4, ipv6) = self.endpoint.bound_sockets();
debug!(
"listening at: {}{}",
ipv4,
ipv6.map(|addr| format!(" and {addr}")).unwrap_or_default()
);
debug!("rpc listening at: {:?}", external_rpc.local_addr());

let mut join_set = JoinSet::new();

// Setup the RPC servers.
let external_rpc = RpcServer::new(external_rpc);
let internal_rpc = RpcServer::new(internal_rpc);

// TODO(frando): I think this is not needed as we do the same in a task just below.
// forward the initial endpoints to the gossip protocol.
// it may happen the the first endpoint update callback is missed because the gossip cell
// is only initialized once the endpoint is fully bound
if let Some(direct_addresses) = self.endpoint.direct_addresses().next().await {
debug!(me = ?self.endpoint.node_id(), "gossip initial update: {direct_addresses:?}");
self.gossip.update_direct_addresses(&direct_addresses).ok();
}

// Spawn a task for the garbage collection.
if let GcPolicy::Interval(gc_period) = gc_policy {
let inner = self.clone();
let handle = self
.rt
.spawn_pinned(move || inner.run_gc_loop(gc_period, gc_done_callback));
// We cannot spawn tasks that run on the local pool directly into the join set,
// so instead we create a new task that supervises the local task.
join_set.spawn({
async move {
if let Err(err) = handle.await {
return Err(anyhow::Error::from(err));
}
Ok(())
}
});
}

// Spawn a task that updates the gossip endpoints.
let inner = self.clone();
join_set.spawn(async move {
let mut stream = inner.endpoint.direct_addresses();
while let Some(eps) = stream.next().await {
if let Err(err) = inner.gossip.update_direct_addresses(&eps) {
warn!("Failed to update direct addresses for gossip: {err:?}");
}
}
warn!("failed to retrieve local endpoints");
Ok(())
});

loop {
tokio::select! {
biased;
_ = self.cancel_token.cancelled() => {
break;
},
// handle rpc requests. This will do nothing if rpc is not configured, since
// accept is just a pending future.
request = external_rpc.accept() => {
match request {
Ok((msg, chan)) => {
rpc::Handler::spawn_rpc_request(self.clone(), &mut join_set, msg, chan);
}
Err(e) => {
info!("rpc request error: {:?}", e);
}
}
},
// handle internal rpc requests.
request = internal_rpc.accept() => {
match request {
Ok((msg, chan)) => {
rpc::Handler::spawn_rpc_request(self.clone(), &mut join_set, msg, chan);
}
Err(e) => {
info!("internal rpc request error: {:?}", e);
}
}
},
// handle incoming p2p connections.
Some(connecting) = self.endpoint.accept() => {
let protocols = protocols.clone();
join_set.spawn(async move {
handle_connection(connecting, protocols).await;
Ok(())
});
},
// handle task terminations and quit on panics.
res = join_set.join_next(), if !join_set.is_empty() => {
if let Some(Err(err)) = res {
error!("Task failed: {err:?}");
break;
}
},
else => break,
}
}

self.shutdown(protocols).await;

// Abort remaining tasks.
join_set.shutdown().await;
}

async fn shutdown(&self, protocols: Arc<ProtocolMap>) {
// Shutdown the different parts of the node concurrently.
let error_code = Closed::ProviderTerminating;
// We ignore all errors during shutdown.
let _ = tokio::join!(
// Close the endpoint.
// Closing the Endpoint is the equivalent of calling Connection::close on all
// connections: Operations will immediately fail with ConnectionError::LocallyClosed.
// All streams are interrupted, this is not graceful.
self.endpoint
.clone()
.close(error_code.into(), error_code.reason()),
// Shutdown sync engine.
self.docs.shutdown(),
// Shutdown blobs store engine.
self.db.shutdown(),
// Shutdown protocol handlers.
protocols.shutdown(),
);
}

async fn run_gc_loop(
self: Arc<Self>,
gc_period: Duration,
done_cb: Option<Box<dyn Fn() + Send>>,
) {
tracing::info!("Starting GC task with interval {:?}", gc_period);
let db = &self.db;
let docs = &self.docs;
let mut live = BTreeSet::new();
'outer: loop {
if let Err(cause) = db.gc_start().await {
tracing::debug!(
"unable to notify the db of GC start: {cause}. Shutting down GC loop."
);
break;
}
// do delay before the two phases of GC
tokio::time::sleep(gc_period).await;
tracing::debug!("Starting GC");
live.clear();

let doc_hashes = match docs.sync.content_hashes().await {
Ok(hashes) => hashes,
Err(err) => {
tracing::warn!("Error getting doc hashes: {}", err);
continue 'outer;
}
};
for hash in doc_hashes {
match hash {
Ok(hash) => {
live.insert(hash);
}
Err(err) => {
tracing::error!("Error getting doc hash: {}", err);
continue 'outer;
}
}
}

tracing::debug!("Starting GC mark phase");
let mut stream = db.gc_mark(&mut live);
while let Some(item) = stream.next().await {
match item {
GcMarkEvent::CustomDebug(text) => {
tracing::debug!("{}", text);
}
GcMarkEvent::CustomWarning(text, _) => {
tracing::warn!("{}", text);
}
GcMarkEvent::Error(err) => {
tracing::error!("Fatal error during GC mark {}", err);
continue 'outer;
}
}
}
drop(stream);

tracing::debug!("Starting GC sweep phase");
let mut stream = db.gc_sweep(&live);
while let Some(item) = stream.next().await {
match item {
GcSweepEvent::CustomDebug(text) => {
tracing::debug!("{}", text);
}
GcSweepEvent::CustomWarning(text, _) => {
tracing::warn!("{}", text);
}
GcSweepEvent::Error(err) => {
tracing::error!("Fatal error during GC mark {}", err);
continue 'outer;
}
}
}
if let Some(ref cb) = done_cb {
cb();
}
}
}
}

async fn handle_connection(
mut connecting: iroh_net::endpoint::Connecting,
protocols: Arc<ProtocolMap>,
) {
let alpn = match connecting.alpn().await {
Ok(alpn) => alpn,
Err(err) => {
warn!("Ignoring connection: invalid handshake: {:?}", err);
return;
}
};
let Some(handler) = protocols.get(&alpn) else {
warn!("Ignoring connection: unsupported ALPN protocol");
return;
};
if let Err(err) = handler.accept(connecting).await {
warn!("Handling incoming connection ended with error: {err}");
}
}

/// Wrapper around [`Engine`] so that we can implement our RPC methods directly.
Expand All @@ -228,7 +459,7 @@ mod tests {

use crate::{
client::blobs::{AddOutcome, WrapOption},
rpc_protocol::{BlobAddPathRequest, BlobAddPathResponse, SetTagOption},
rpc_protocol::SetTagOption,
};

use super::*;
Expand Down Expand Up @@ -289,18 +520,17 @@ mod tests {

let _got_hash = tokio::time::timeout(Duration::from_secs(1), async move {
let mut stream = node
.controller()
.server_streaming(BlobAddPathRequest {
path: Path::new(env!("CARGO_MANIFEST_DIR")).join("README.md"),
in_place: false,
tag: SetTagOption::Auto,
wrap: WrapOption::NoWrap,
})
.blobs()
.add_from_path(
Path::new(env!("CARGO_MANIFEST_DIR")).join("README.md"),
false,
SetTagOption::Auto,
WrapOption::NoWrap,
)
.await?;

while let Some(item) = stream.next().await {
let BlobAddPathResponse(progress) = item?;
match progress {
while let Some(progress) = stream.next().await {
match progress? {
AddProgress::AllDone { hash, .. } => {
return Ok(hash);
}
Expand Down
Loading

0 comments on commit 08f1fe0

Please sign in to comment.