Skip to content

Commit

Permalink
pod belonging bug (#27)
Browse files Browse the repository at this point in the history
* docs: update status machine

* fix: specify pod by task name instead of task type

* chore: update version to v1.1.3
  • Loading branch information
konnase authored Aug 22, 2022
1 parent 50ada3d commit 415f4ef
Show file tree
Hide file tree
Showing 10 changed files with 16 additions and 11 deletions.
2 changes: 1 addition & 1 deletion .github/workflows/release.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@ name: Release
on: [push]

env:
version: v1.1.2
version: v1.1.3

jobs:
docker:
Expand Down
2 changes: 1 addition & 1 deletion Makefile
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@

# di-operator version
APP_VERSION ?= 0.1.0
VERSION ?= v1.1.2
VERSION ?= v1.1.3
MASTER_VERSION := $(VERSION)

COMMIT_SHORT_SHA=$(shell git log -n 1 | head -n 1 | sed -e 's/^commit //' | head -c 8)
Expand Down
2 changes: 1 addition & 1 deletion chart/Chart.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,7 @@ type: application

# This is the chart version. This version number should be incremented each time you make changes
# to the chart and its templates, including the app version.
version: 1.1.2
version: 1.1.3

# This is the version number of the application being deployed. This version number should be
# incremented each time you make changes to the application.
Expand Down
2 changes: 1 addition & 1 deletion chart/values.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@
# Declare variables to be passed into your templates.

# tag for di-orchestrator image
tag: v1.1.2
tag: v1.1.3

# registry for di-orchestrator image
registry: opendilab
Expand Down
4 changes: 2 additions & 2 deletions config/di-manager.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -5214,7 +5214,7 @@ spec:
envFrom:
- configMapRef:
name: di-config
image: opendilab/di-orchestrator:v1.1.2
image: opendilab/di-orchestrator:v1.1.3
imagePullPolicy: Always
livenessProbe:
httpGet:
Expand Down Expand Up @@ -5270,7 +5270,7 @@ spec:
envFrom:
- configMapRef:
name: di-config
image: opendilab/di-orchestrator:v1.1.2
image: opendilab/di-orchestrator:v1.1.3
imagePullPolicy: Always
livenessProbe:
httpGet:
Expand Down
2 changes: 1 addition & 1 deletion config/manager/di_operator.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -26,7 +26,7 @@ spec:
- "--leader-elect"
- "--qps=100"
- "--burst=200"
image: opendilab/di-orchestrator:v1.1.2
image: opendilab/di-orchestrator:v1.1.3
imagePullPolicy: Always
name: manager
envFrom:
Expand Down
2 changes: 1 addition & 1 deletion config/manager/di_server.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -24,7 +24,7 @@ spec:
- "--server-bind-address=:8081"
- "--qps=100"
- "--burst=200"
image: opendilab/di-orchestrator:v1.1.2
image: opendilab/di-orchestrator:v1.1.3
imagePullPolicy: Always
name: server
envFrom:
Expand Down
2 changes: 1 addition & 1 deletion config/manager/di_webhook.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -24,7 +24,7 @@ spec:
- "--port=9443"
- "--qps=100"
- "--burst=200"
image: opendilab/di-orchestrator:v1.1.2
image: opendilab/di-orchestrator:v1.1.3
imagePullPolicy: Always
name: webhook
securityContext:
Expand Down
2 changes: 1 addition & 1 deletion config/manager/kustomization.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -8,4 +8,4 @@ kind: Kustomization
images:
- name: opendilab/di-orchestrator
newName: opendilab/di-orchestrator
newTag: v1.1.2
newTag: v1.1.3
7 changes: 6 additions & 1 deletion pkg/controllers/dijob.go
Original file line number Diff line number Diff line change
Expand Up @@ -279,7 +279,7 @@ func (r *DIJobReconciler) createMissedReplicas(ctx context.Context, job *div2alp
for index, task := range job.Spec.Tasks {
localTrace := make([]bool, task.Replicas)
for _, pod := range pods {
if pod.Labels[dicommon.LabelTaskType] != string(task.Type) {
if pod.Labels[dicommon.LabelTaskName] != string(task.Name) {
continue
}
localRank, err := strconv.Atoi(pod.Annotations[dicommon.AnnotationTaskRank])
Expand All @@ -288,6 +288,11 @@ func (r *DIJobReconciler) createMissedReplicas(ctx context.Context, job *div2alp
pod.Annotations[dicommon.AnnotationTaskRank]))
return err
}
if localRank >= len(localTrace) {
err := fmt.Errorf("pod %s local rank is out of range", pod.Name)
log.Error(err, "get local rank")
return err
}
localTrace[localRank] = true
}
for localRank := 0; localRank < int(task.Replicas); localRank++ {
Expand Down

0 comments on commit 415f4ef

Please sign in to comment.