Commit

Set min nodes to 0 for worker and user. (#2168)
Co-authored-by: Prashant Tiwari <prashant.tiwari@genesisoilandgas.com>
Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
Co-authored-by: Christopher Ostrouchov <chris.ostrouchov@gmail.com>
Co-authored-by: Chuck McAndrew <6248903+dcmcand@users.noreply.github.com>
5 people authored Feb 10, 2024
1 parent c343d36 commit 0d1a30d
Showing 8 changed files with 96 additions and 8 deletions.
40 changes: 40 additions & 0 deletions src/_nebari/provider/cloud/amazon_web_services.py
@@ -143,6 +143,46 @@ def aws_get_vpc_id(name: str, namespace: str, region: str) -> Optional[str]:
return None


def set_asg_tags(asg_node_group_map: Dict[str, str], region: str) -> None:
"""Set tags for AWS node scaling from zero to work."""
session = aws_session(region=region)
autoscaling_client = session.client("autoscaling")
tags = []
for asg_name, node_group in asg_node_group_map.items():
tags.append(
{
"Key": "k8s.io/cluster-autoscaler/node-template/label/dedicated",
"Value": node_group,
"ResourceId": asg_name,
"ResourceType": "auto-scaling-group",
"PropagateAtLaunch": True,
}
)
autoscaling_client.create_or_update_tags(Tags=tags)


def aws_get_asg_node_group_mapping(
name: str, namespace: str, region: str
) -> Dict[str, str]:
"""Return a dictionary of autoscaling groups and their associated node groups."""
asg_node_group_mapping = {}
session = aws_session(region=region)
eks = session.client("eks")
node_groups_response = eks.list_nodegroups(
clusterName=f"{name}-{namespace}",
)
node_groups = node_groups_response.get("nodegroups", [])
for nodegroup in node_groups:
response = eks.describe_nodegroup(
clusterName=f"{name}-{namespace}", nodegroupName=nodegroup
)
node_group_name = response["nodegroup"]["nodegroupName"]
auto_scaling_groups = response["nodegroup"]["resources"]["autoScalingGroups"]
for auto_scaling_group in auto_scaling_groups:
asg_node_group_mapping[auto_scaling_group["name"]] = node_group_name
return asg_node_group_mapping


def aws_get_subnet_ids(name: str, namespace: str, region: str) -> List[str]:
"""Return list of subnet IDs for the EKS cluster named `{name}-{namespace}`."""
session = aws_session(region=region)
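For reference, the mapping returned by aws_get_asg_node_group_mapping pairs each EKS-managed autoscaling group with its node group. A sketch of its shape for a cluster named `{name}-{namespace}` follows; the ASG names are hypothetical, since EKS generates them:

    # Hypothetical output for a "nebari-dev" cluster with the default node groups;
    # real ASG names are generated by EKS and will differ.
    asg_node_group_map = {
        "eks-general-1a2b3c": "general",
        "eks-user-4d5e6f": "user",
        "eks-worker-7a8b9c": "worker",
    }
    # set_asg_tags() then writes one propagated tag per ASG:
    #   Key:   k8s.io/cluster-autoscaler/node-template/label/dedicated
    #   Value: the node group name (e.g. "worker")
    # which is the convention the cluster-autoscaler uses to learn node labels
    # for groups that currently have zero nodes.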
25 changes: 23 additions & 2 deletions src/_nebari/stages/infrastructure/__init__.py
@@ -145,6 +145,17 @@ class AWSInputVars(schema.Base):
tags: Dict[str, str] = {}


def _calculate_asg_node_group_map(config: schema.Main):
if config.provider == schema.ProviderEnum.aws:
return amazon_web_services.aws_get_asg_node_group_mapping(
config.project_name,
config.namespace,
config.amazon_web_services.region,
)
else:
return {}


def _calculate_node_groups(config: schema.Main):
if config.provider == schema.ProviderEnum.aws:
return {
@@ -438,10 +449,10 @@ class AmazonWebServicesProvider(schema.Base):
node_groups: Dict[str, AWSNodeGroup] = {
"general": AWSNodeGroup(instance="m5.2xlarge", min_nodes=1, max_nodes=1),
"user": AWSNodeGroup(
instance="m5.xlarge", min_nodes=1, max_nodes=5, single_subnet=False
instance="m5.xlarge", min_nodes=0, max_nodes=5, single_subnet=False
),
"worker": AWSNodeGroup(
instance="m5.xlarge", min_nodes=1, max_nodes=5, single_subnet=False
instance="m5.xlarge", min_nodes=0, max_nodes=5, single_subnet=False
),
}
existing_subnet_ids: List[str] = None
@@ -814,6 +825,16 @@ def set_outputs(
outputs["node_selectors"] = _calculate_node_groups(self.config)
super().set_outputs(stage_outputs, outputs)

@contextlib.contextmanager
def post_deploy(
self, stage_outputs: Dict[str, Dict[str, Any]], disable_prompt: bool = False
):
asg_node_group_map = _calculate_asg_node_group_map(self.config)
if asg_node_group_map:
amazon_web_services.set_asg_tags(
asg_node_group_map, self.config.amazon_web_services.region
)

@contextlib.contextmanager
def deploy(
self, stage_outputs: Dict[str, Dict[str, Any]], disable_prompt: bool = False
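The new post_deploy hook can only run after the Terraform apply, because the autoscaling groups exist only once the EKS node groups have been created. A minimal sketch of the equivalent manual call, assuming hypothetical project, namespace, and region values and valid AWS credentials:

    from _nebari.provider.cloud import amazon_web_services

    # Hypothetical values; a real deployment reads these from the Nebari config.
    project, namespace, region = "nebari", "dev", "us-east-1"

    mapping = amazon_web_services.aws_get_asg_node_group_mapping(project, namespace, region)
    if mapping:
        amazon_web_services.set_asg_tags(mapping, region)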
@@ -39,6 +39,10 @@ resource "aws_eks_node_group" "main" {
max_size = var.node_groups[count.index].max_size
}

labels = {
"dedicated" = var.node_groups[count.index].name
}

lifecycle {
ignore_changes = [
scaling_config[0].desired_size,
@@ -53,7 +57,9 @@ resource "aws_eks_node_group" "main" {
]

tags = merge({
"kubernetes.io/cluster/${var.name}" = "shared"
# "kubernetes.io/cluster/${var.name}" = "shared"
"k8s.io/cluster-autoscaler/node-template/label/dedicated" = var.node_groups[count.index].name
propagate_at_launch = true
}, var.tags)
}

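One way to confirm the autoscaler tag actually reached an autoscaling group after a deploy is to query it with boto3; a quick sketch, assuming default AWS credentials and a hypothetical ASG name and region:

    import boto3

    client = boto3.client("autoscaling", region_name="us-east-1")
    response = client.describe_tags(
        Filters=[
            {"Name": "auto-scaling-group", "Values": ["eks-worker-7a8b9c"]},
            {"Name": "key", "Values": ["k8s.io/cluster-autoscaler/node-template/label/dedicated"]},
        ]
    )
    for tag in response.get("Tags", []):
        # Expect the node group name, e.g. "worker", with PropagateAtLaunch set.
        print(tag["Key"], "=", tag["Value"], "| propagate:", tag["PropagateAtLaunch"])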
3 changes: 3 additions & 0 deletions src/_nebari/stages/kubernetes_services/__init__.py
@@ -362,6 +362,7 @@ class JupyterhubInputVars(schema.Base):
idle_culler_settings: Dict[str, Any] = Field(alias="idle-culler-settings")
argo_workflows_enabled: bool = Field(alias="argo-workflows-enabled")
jhub_apps_enabled: bool = Field(alias="jhub-apps-enabled")
cloud_provider: str = Field(alias="cloud-provider")


class DaskGatewayInputVars(schema.Base):
@@ -411,6 +412,7 @@ def input_vars(self, stage_outputs: Dict[str, Dict[str, Any]]):
realm_id = stage_outputs["stages/06-kubernetes-keycloak-configuration"][
"realm_id"
]["value"]
cloud_provider = self.config.provider.value
jupyterhub_shared_endpoint = (
stage_outputs["stages/02-infrastructure"]
.get("nfs_endpoint", {})
@@ -486,6 +488,7 @@ def input_vars(self, stage_outputs: Dict[str, Dict[str, Any]]):
),
jupyterhub_stared_storage=self.config.storage.shared_filesystem,
jupyterhub_shared_endpoint=jupyterhub_shared_endpoint,
cloud_provider=cloud_provider,
jupyterhub_profiles=self.config.profiles.dict()["jupyterlab"],
jupyterhub_image=_split_docker_image_name(
self.config.default_images.jupyterhub
6 changes: 6 additions & 0 deletions src/_nebari/stages/kubernetes_services/template/jupyterhub.tf
@@ -55,6 +55,10 @@ variable "idle-culler-settings" {
type = any
}

variable "cloud-provider" {
description = "Name of cloud provider."
type = string
}

module "kubernetes-nfs-server" {
count = var.jupyterhub-shared-endpoint == null ? 1 : 0
@@ -88,6 +92,8 @@ module "jupyterhub" {
name = var.name
namespace = var.environment

cloud-provider = var.cloud-provider

external-url = var.endpoint
realm_id = var.realm_id

@@ -114,8 +114,12 @@ def list_dask_environments():


def base_node_group(options):
+    key = config["worker-node-group"]["key"]
+    if config.provider.value == "aws":
+        key = "dedicated"
     default_node_group = {
-        config["worker-node-group"]["key"]: config["worker-node-group"]["value"]
+        key: config["worker-node-group"]["value"],
+        # config["worker-node-group"]["key"]: config["worker-node-group"]["value"],
}

# check `worker_extra_pod_config` first
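The effect of this change is that the base node selector for Dask workers now depends on the provider: on AWS it uses the dedicated label that the ASG tag and node-group labels advertise, so the worker group can scale up from zero, while other providers keep their own key. A hedged illustration, assuming a worker node group named "worker" (the GKE key is an example, not taken from this commit):

    # On AWS the selector matches the "dedicated" node label:
    aws_selector = {"dedicated": "worker"}

    # On other providers it stays whatever key/value the node-group config exposes,
    # e.g. (hypothetically) on GKE:
    gcp_selector = {"cloud.google.com/gke-nodepool": "worker"}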
@@ -16,8 +16,11 @@ resource "random_password" "jhub_apps_jwt_secret" {
}

locals {
-  jhub_apps_secrets_name = "jhub-apps-secrets"
-  jhub_apps_env_var_name = "JHUB_APP_JWT_SECRET_KEY"
+  jhub_apps_secrets_name           = "jhub-apps-secrets"
+  jhub_apps_env_var_name           = "JHUB_APP_JWT_SECRET_KEY"
+  singleuser_nodeselector_key      = var.cloud-provider == "aws" ? "dedicated" : var.user-node-group.key
+  userscheduler_nodeselector_key   = var.cloud-provider == "aws" ? "dedicated" : var.user-node-group.key
+  userscheduler_nodeselector_value = var.cloud-provider == "aws" ? var.general-node-group.value : var.user-node-group.value
}

resource "kubernetes_secret" "jhub_apps_secrets" {
@@ -174,14 +177,14 @@ resource "helm_release" "jupyterhub" {
singleuser = {
image = var.jupyterlab-image
nodeSelector = {
"${var.user-node-group.key}" = var.user-node-group.value
"${local.singleuser_nodeselector_key}" = var.user-node-group.value
}
}

scheduling = {
userScheduler = {
nodeSelector = {
"${var.user-node-group.key}" = var.user-node-group.value
"${local.userscheduler_nodeselector_key}" = local.userscheduler_nodeselector_value
}
}
}
@@ -168,6 +168,11 @@ variable "jupyterlab-pioneer-log-format" {
type = string
}

variable "cloud-provider" {
description = "Name of cloud provider."
type = string
}

variable "initial-repositories" {
description = "Map of folder location and git repo url to clone"
type = string