Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add aws_launch_template & config vars to enable options to specify pre-bootstrap commands and/or custom AMI for EKS nodes #2621

Closed
wants to merge 9 commits into from
6 changes: 6 additions & 0 deletions src/_nebari/stages/infrastructure/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -118,6 +118,7 @@ class AzureInputVars(schema.Base):
class AWSNodeGroupInputVars(schema.Base):
name: str
instance_type: str
custom_ami: Optional[str] = None
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

to follow the aws standard naming, let's use ami_type

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

hello @viniciusdc,
in this case, the custom_ami prompts the user to input an AMI Id, as opposed to the ami_type (e.g. "CUSTOM"). The corresponding aws var here is image_id. My impression of image_id is that it seems a bit ambiguous for nebari config purposes since it does not make it obvious that an ami is expected.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@viniciusdc how about using ami_image_id ?

Copy link
Contributor

@viniciusdc viniciusdc Aug 20, 2024

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@joneszc I see; I was under the impression ami_type would support the ID of a custom AMI as well, but after your comment and re-reding the docs, that does not seem to be true (unfortunately). I agree with you that image_id would also be ambiguous, considering the overall shape of the nebari-config.

gpu: bool = False
min_size: int
desired_size: int
Expand All @@ -133,6 +134,7 @@ class AWSInputVars(schema.Base):
existing_subnet_ids: Optional[List[str]] = None
region: str
kubernetes_version: str
node_prebootstrap_command: Optional[str] = None
node_groups: List[AWSNodeGroupInputVars]
availability_zones: List[str]
vpc_cidr_block: str
Expand Down Expand Up @@ -428,6 +430,7 @@ def _validate_tags(cls, value: Optional[Dict[str, str]]) -> Dict[str, str]:

class AWSNodeGroup(schema.Base):
instance: str
custom_ami: Optional[str] = None
min_nodes: int = 0
max_nodes: int
gpu: bool = False
Expand All @@ -451,6 +454,7 @@ class AmazonWebServicesProvider(schema.Base):
kubernetes_version: str
availability_zones: Optional[List[str]]
node_groups: Dict[str, AWSNodeGroup] = DEFAULT_AWS_NODE_GROUPS
node_prebootstrap_command: Optional[str] = None
existing_subnet_ids: Optional[List[str]] = None
existing_security_group_id: Optional[str] = None
vpc_cidr_block: str = "10.10.0.0/16"
Expand Down Expand Up @@ -789,6 +793,7 @@ def input_vars(self, stage_outputs: Dict[str, Dict[str, Any]]):
return AWSInputVars(
name=self.config.escaped_project_name,
environment=self.config.namespace,
node_prebootstrap_command=self.config.amazon_web_services.node_prebootstrap_command,
existing_subnet_ids=self.config.amazon_web_services.existing_subnet_ids,
existing_security_group_id=self.config.amazon_web_services.existing_security_group_id,
region=self.config.amazon_web_services.region,
Expand All @@ -797,6 +802,7 @@ def input_vars(self, stage_outputs: Dict[str, Dict[str, Any]]):
AWSNodeGroupInputVars(
name=name,
instance_type=node_group.instance,
custom_ami=node_group.custom_ami,
gpu=node_group.gpu,
min_size=node_group.min_nodes,
desired_size=node_group.min_nodes,
Expand Down
1 change: 1 addition & 0 deletions src/_nebari/stages/infrastructure/template/aws/main.tf
Original file line number Diff line number Diff line change
Expand Up @@ -93,6 +93,7 @@ module "kubernetes" {
node_groups = var.node_groups

endpoint_private_access = var.eks_endpoint_private_access
node_prebootstrap_command = var.node_prebootstrap_command
public_access_cidrs = var.eks_public_access_cidrs
permissions_boundary = var.permissions_boundary
}
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,7 @@ locals {
], var.node_group_additional_policies)

gpu_node_group_names = [for node_group in var.node_groups : node_group.name if node_group.gpu == true]
cust_ami_node_index = [for idx, node_group in var.node_groups : idx if node_group.custom_ami != null]

partition = data.aws_partition.current.partition
}
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,46 @@ resource "aws_eks_cluster" "main" {
tags = merge({ Name = var.name }, var.tags)
}

## aws_launch_template user_data invocation
## If using a Custom AMI, then the /etc/eks/bootstrap cmds and args must be included/modified,
## otherwise, on default AWS EKS Node AMI, the bootstrap cmd is appended automatically
resource "aws_launch_template" "main" {
# Invoke launch_template only if var.node_prebootstrap_command is not null or custom_ami is not null
count = var.node_prebootstrap_command != null ? length(var.node_groups) : length(local.cust_ami_node_index)
name = var.node_prebootstrap_command != null ? var.node_groups[count.index].name : var.node_groups[local.cust_ami_node_index[count.index]].name
image_id = var.node_prebootstrap_command != null ? var.node_groups[count.index].custom_ami : var.node_groups[local.cust_ami_node_index[count.index]].custom_ami

vpc_security_group_ids = var.cluster_security_groups

block_device_mappings {
device_name = "/dev/xvda"

ebs {
volume_size = 50
volume_type = "gp2"
}
}
# https://docs.aws.amazon.com/eks/latest/userguide/launch-templates.html#launch-template-basics
user_data = base64encode(<<-EOF
MIME-Version: 1.0
Content-Type: multipart/mixed; boundary="==MYBOUNDARY=="

--==MYBOUNDARY==
Content-Type: text/x-shellscript; charset="us-ascii"
%{ if var.node_prebootstrap_command != null }${var.node_prebootstrap_command}%{ endif }

%{ if var.node_prebootstrap_command != null && var.node_groups[count.index].custom_ami != null }--==MYBOUNDARY==%{ endif }
%{ if var.node_prebootstrap_command != null && var.node_groups[count.index].custom_ami != null }Content-Type: text/x-shellscript; charset="us-ascii"%{ endif }
%{ if var.node_prebootstrap_command != null && var.node_groups[count.index].custom_ami == null }%{ else }#!/bin/bash%{ endif }
%{ if var.node_prebootstrap_command != null && var.node_groups[count.index].custom_ami == null }%{ else }set -ex%{ endif }
%{ if var.node_prebootstrap_command != null && var.node_groups[count.index].custom_ami == null }%{ else }B64_CLUSTER_CA=${aws_eks_cluster.main.certificate_authority[0].data}%{ endif }
%{ if var.node_prebootstrap_command != null && var.node_groups[count.index].custom_ami == null }%{ else }API_SERVER_URL=${aws_eks_cluster.main.endpoint}%{ endif }
%{ if var.node_prebootstrap_command != null && var.node_groups[count.index].custom_ami == null }%{ else }/etc/eks/bootstrap.sh ${aws_eks_cluster.main.name} --b64-cluster-ca $B64_CLUSTER_CA --apiserver-endpoint $API_SERVER_URL%{ endif }

--==MYBOUNDARY==--
EOF
Copy link
Contributor

@viniciusdc viniciusdc Aug 14, 2024

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I would suggest moving this into /files as a .tpl file and reading it using templatefile(path, vars) , e.g :

base64encode(templatefile("${path.module}/files/user_data.tpl", {
    node_prebootstrap_command = var.node_prebootstrap_command
    node_groups = var.node_groups
    aws_eks_cluster = aws_eks_cluster.main
    count_index = count.index
    cust_ami_node_index = local.cust_ami_node_index
  }))

as this will become easier to maintain

Copy link
Contributor Author

@joneszc joneszc Aug 14, 2024

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@viniciusdc

I've implemented the suggested modification with updated main.tf and added files/user_data.tftpl

)
}

resource "aws_eks_node_group" "main" {
count = length(var.node_groups)
Expand All @@ -30,8 +70,8 @@ resource "aws_eks_node_group" "main" {
subnet_ids = var.node_groups[count.index].single_subnet ? [element(var.cluster_subnets, 0)] : var.cluster_subnets

instance_types = [var.node_groups[count.index].instance_type]
ami_type = var.node_groups[count.index].gpu == true ? "AL2_x86_64_GPU" : "AL2_x86_64"
disk_size = 50
ami_type = var.node_groups[count.index].custom_ami != null ? "CUSTOM" : (var.node_groups[count.index].gpu == true ? "AL2_x86_64_GPU" : "AL2_x86_64")
disk_size = var.node_prebootstrap_command == null && var.node_groups[count.index].custom_ami == null ? 50 : null
Comment on lines +73 to +74
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

There is nothing wrong with the way it is right now, but it would be better if "AL2_x86_64_GPU" and "AL2_x86_64" were set as defaults in the python schema itself, so that we could clean this conditionals a bit

Copy link
Contributor Author

@joneszc joneszc Aug 14, 2024

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@viniciusdc
ami_type (i.e. AL2_x86_64_GPU/AL2_x86_64) is not fetched from nebari-config.yaml or otherwise processed in __init__.py in either the current state of Nebari or in the scope of the proposed PR. Adding ami_type to the python schema would require a separate update.


scaling_config {
min_size = var.node_groups[count.index].min_size
Expand All @@ -49,6 +89,15 @@ resource "aws_eks_node_group" "main" {
]
}

# Invoke launch_template only if var.node_prebootstrap_command is not null or node group custom_ami is not null
dynamic "launch_template" {
for_each = var.node_prebootstrap_command == null && var.node_groups[count.index].custom_ami == null ? [] : [1]
content {
id = var.node_prebootstrap_command != null ? aws_launch_template.main[count.index].id : aws_launch_template.main[index(local.cust_ami_node_index, count.index)].id
version = var.node_prebootstrap_command != null ? aws_launch_template.main[count.index].latest_version : aws_launch_template.main[index(local.cust_ami_node_index, count.index)].latest_version
}
}

# Ensure that IAM Role permissions are created before and deleted
# after EKS Node Group handling. Otherwise, EKS will not be able to
# properly delete EC2 Instances and Elastic Network Interfaces.
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -46,6 +46,7 @@ variable "node_groups" {
type = list(object({
name = string
instance_type = string
custom_ami = string
gpu = bool
min_size = number
desired_size = number
Expand All @@ -60,6 +61,12 @@ variable "node_group_instance_type" {
default = "m5.large"
}

variable "node_prebootstrap_command" {
description = "Custom pre-bootstrap commands run on EKS nodes"
type = string
default = null
}

variable "endpoint_private_access" {
type = bool
default = false
Expand Down
7 changes: 7 additions & 0 deletions src/_nebari/stages/infrastructure/template/aws/variables.tf
Original file line number Diff line number Diff line change
Expand Up @@ -33,6 +33,7 @@ variable "node_groups" {
type = list(object({
name = string
instance_type = string
custom_ami = string
gpu = bool
min_size = number
desired_size = number
Expand All @@ -56,6 +57,12 @@ variable "kubeconfig_filename" {
type = string
}

variable "node_prebootstrap_command" {
description = "Custom pre-bootstrap commands run on EKS nodes"
type = string
default = null
}

variable "eks_endpoint_private_access" {
type = bool
default = false
Expand Down