feat: handle upgrade job
This commit is contained in:
parent
89702d287a
commit
9b219d967e
@ -4,7 +4,7 @@ tmp_dir = "tmp"
|
||||
|
||||
[build]
|
||||
args_bin = []
|
||||
bin = "./tmp/main"
|
||||
bin = ";set -o allexport && source ./.env && set +o allexport; ./tmp/main"
|
||||
cmd = "go build -o ./tmp/main ./cmd/operator"
|
||||
delay = 1000
|
||||
exclude_dir = ["assets", "tmp", "vendor", "testdata"]
|
||||
|
||||
@ -1 +1,5 @@
|
||||
Tiltfile
|
||||
Dockerfile
|
||||
.env.example
|
||||
.git
|
||||
.gitignore
|
||||
@ -1 +1,2 @@
|
||||
FLINK_API_URL=127.0.0.1:8081
|
||||
FLINK_API_URL=127.0.0.1:8081
|
||||
SAVEPOINT_PATH=/opt/flink/savepoints
|
||||
3
.gitignore
vendored
3
.gitignore
vendored
@ -1,4 +1,5 @@
|
||||
db
|
||||
*.log
|
||||
tmp
|
||||
*.jar
|
||||
*.jar
|
||||
.env
|
||||
4
.vscode/launch.json
vendored
4
.vscode/launch.json
vendored
@ -9,6 +9,10 @@
|
||||
"type": "go",
|
||||
"request": "launch",
|
||||
"mode": "auto",
|
||||
"env": {
|
||||
"FLINK_API_URL": "127.0.0.1:8081",
|
||||
"SAVEPOINT_PATH": "/opt/flink/savepoints"
|
||||
},
|
||||
"cwd": "${workspaceFolder}",
|
||||
"program": "${workspaceFolder}/cmd/operator"
|
||||
}
|
||||
|
||||
@ -88,6 +88,8 @@ spec:
|
||||
lastRestoredSavepointRestoredDate:
|
||||
type: string
|
||||
format: time
|
||||
runningJarURI:
|
||||
type: string
|
||||
restoredCount:
|
||||
type: number
|
||||
additionalPrinterColumns:
|
||||
|
||||
@ -19,17 +19,18 @@ type FlinkJobSpec struct {
|
||||
}
|
||||
|
||||
type FlinkJobStatus struct {
|
||||
JobStatus JobStatus `json:"jobStatus,omitempty"`
|
||||
LifeCycleStatus *string `json:"lifeCycleStatus,omitempty"`
|
||||
LastSavepointPath *string `json:"lastSavepointPath,omitempty"`
|
||||
JarId *string `json:"jarId,omitempty"`
|
||||
JobId *string `json:"jobId,omitempty"`
|
||||
Error *string `json:"error,omitempty"`
|
||||
SavepointTriggerId *string `json:"savepointTriggerId,omitempty"`
|
||||
LastSavepointDate *time.Time `json:"lastSavepointDate,omitempty"`
|
||||
LastRestoredSavepointDate *time.Time `json:"lastRestoredSavepointDate,omitempty"`
|
||||
LastRestoredSavepointRestoredDate *time.Time `json:"lastRestoredSavepointRestoredDate,omitempty"`
|
||||
RestoredCount int `json:"restoredCount,omitempty"`
|
||||
JobStatus JobStatus `json:"jobStatus,omitempty"`
|
||||
LifeCycleStatus LifeCycleStatus `json:"lifeCycleStatus,omitempty"`
|
||||
LastSavepointPath *string `json:"lastSavepointPath,omitempty"`
|
||||
JarId *string `json:"jarId,omitempty"`
|
||||
JobId *string `json:"jobId,omitempty"`
|
||||
Error *string `json:"error,omitempty"`
|
||||
SavepointTriggerId *string `json:"savepointTriggerId,omitempty"`
|
||||
LastSavepointDate *time.Time `json:"lastSavepointDate,omitempty"`
|
||||
LastRestoredSavepointDate *time.Time `json:"lastRestoredSavepointDate,omitempty"`
|
||||
LastRestoredSavepointRestoredDate *time.Time `json:"lastRestoredSavepointRestoredDate,omitempty"`
|
||||
RestoredCount int `json:"restoredCount,omitempty"`
|
||||
RunningJarURI *string `json:"runningJarURI"`
|
||||
}
|
||||
|
||||
// +k8s:deepcopy-gen:interfaces=k8s.io/apimachinery/pkg/runtime.Object
|
||||
@ -49,6 +50,7 @@ type FlinkJobList struct {
|
||||
|
||||
var (
|
||||
ErrNoJobId = errors.New("[managed-job] no job id")
|
||||
ErrNoJarId = errors.New("[managed-job] no jar id")
|
||||
ErrNoSavepointTriggerId = errors.New("[managed-job] no savepoint trigger id")
|
||||
ErrNoSavepointPath = errors.New("[managed-job] no savepoint path")
|
||||
)
|
||||
@ -75,6 +77,7 @@ type LifeCycleStatus string
|
||||
const (
|
||||
LifeCycleStatusInitializing LifeCycleStatus = "INITIALIZING"
|
||||
LifeCycleStatusRestoring LifeCycleStatus = "RESTORING"
|
||||
LifeCycleStatusUpgradeFailed LifeCycleStatus = "UPGRADE_FAILED"
|
||||
LifeCycleStatusUnhealthyJobManager LifeCycleStatus = "UNHEALTHY_JOB_MANAGER"
|
||||
LifeCycleStatusHealthy LifeCycleStatus = "HEALTHY"
|
||||
LifeCycleStatusFailed LifeCycleStatus = "FAILED"
|
||||
|
||||
@ -3,7 +3,6 @@ package crd
|
||||
import (
|
||||
"context"
|
||||
"flink-kube-operator/internal/crd/v1alpha1"
|
||||
"fmt"
|
||||
|
||||
"flink-kube-operator/pkg"
|
||||
|
||||
@ -51,8 +50,7 @@ func (crd Crd) watchFlinkJobs() rxgo.Observable {
|
||||
switch event.Type {
|
||||
case watch.Bookmark:
|
||||
case watch.Modified:
|
||||
pkg.Logger.Info("[crd] [watch] flink job updated")
|
||||
fmt.Printf("FlinkJob updated: %s\n", job.GetName())
|
||||
pkg.Logger.Info("[crd] [watch] flink job updated", zap.String("jobName", job.GetName()))
|
||||
crd.repsert(job)
|
||||
case watch.Added:
|
||||
pkg.Logger.Info("[crd] [watch] new flink job created")
|
||||
|
||||
@ -2,7 +2,6 @@ package managed_job
|
||||
|
||||
import (
|
||||
"flink-kube-operator/internal/crd/v1alpha1"
|
||||
"strings"
|
||||
"time"
|
||||
|
||||
"flink-kube-operator/pkg"
|
||||
@ -10,83 +9,14 @@ import (
|
||||
"go.uber.org/zap"
|
||||
)
|
||||
|
||||
// func (job *ManagedJob) startCycle() {
|
||||
// ticker := time.NewTicker(5 * time.Second)
|
||||
// quit := make(chan struct{})
|
||||
|
||||
// // load job state from db
|
||||
// job.loadState()
|
||||
// go func() {
|
||||
// for {
|
||||
// select {
|
||||
// case <-ticker.C:
|
||||
// job.cycle()
|
||||
// case <-quit:
|
||||
// ticker.Stop()
|
||||
// return
|
||||
// }
|
||||
// }
|
||||
// }()
|
||||
// }
|
||||
|
||||
func (job *ManagedJob) Cycle() {
|
||||
pkg.Logger.Debug("[managed-job] [new] check cycle", zap.String("jobKey", string(job.def.UID)))
|
||||
|
||||
// Init job
|
||||
if job.def.Status.JobStatus == "" {
|
||||
if job.def.Status.LastSavepointPath == nil {
|
||||
if job.def.Status.JarId == nil {
|
||||
err := job.upload()
|
||||
if err != nil {
|
||||
job.crd.Patch(job.def.UID, map[string]interface{}{
|
||||
"status": map[string]interface{}{
|
||||
"error": "[upload-error] " + err.Error(),
|
||||
},
|
||||
})
|
||||
return
|
||||
}
|
||||
}
|
||||
for {
|
||||
err := job.run()
|
||||
if err != nil {
|
||||
if strings.ContainsAny(err.Error(), ".jar does not exist") {
|
||||
err := job.upload()
|
||||
if err != nil {
|
||||
job.crd.Patch(job.def.UID, map[string]interface{}{
|
||||
"status": map[string]interface{}{
|
||||
"error": "[upload-error] " + err.Error(),
|
||||
},
|
||||
})
|
||||
return
|
||||
}
|
||||
continue
|
||||
}
|
||||
job.crd.Patch(job.def.UID, map[string]interface{}{
|
||||
"status": map[string]interface{}{
|
||||
"error": "[run-error] " + err.Error(),
|
||||
},
|
||||
})
|
||||
return
|
||||
}
|
||||
return
|
||||
}
|
||||
} else {
|
||||
job.restore()
|
||||
}
|
||||
if job.def.Status.LifeCycleStatus == "" && job.def.Status.JobStatus == "" {
|
||||
job.run()
|
||||
return
|
||||
}
|
||||
// job.crd.SetJobStatus(job.def.UID, v1alpha1.FlinkJobStatus{
|
||||
// JobStatus: job.def.Status.JobStatus,
|
||||
// })
|
||||
|
||||
// Check for set running or error state
|
||||
/* if job.state.Status == JobStatusCreating || job.state.Status == JobStatusFailing {
|
||||
err := job.checkStatus()
|
||||
if errors.Is(err, ErrNoJobId) {
|
||||
job.state = nil
|
||||
}
|
||||
return
|
||||
} */
|
||||
|
||||
if job.def.Status.JobStatus == v1alpha1.JobStatusRunning {
|
||||
if (job.def.Spec.SavepointInterval.Duration != 0) && ((job.def.Status.LastSavepointDate == nil) || time.Now().Add(-job.def.Spec.SavepointInterval.Duration).After(*job.def.Status.LastSavepointDate)) {
|
||||
@ -96,16 +26,20 @@ func (job *ManagedJob) Cycle() {
|
||||
job.trackSavepoint()
|
||||
}
|
||||
}
|
||||
if job.def.Status.RunningJarURI != nil && job.def.Spec.JarURI != *job.def.Status.RunningJarURI {
|
||||
job.upgrade()
|
||||
}
|
||||
|
||||
return
|
||||
}
|
||||
if job.def.Status.JobStatus == v1alpha1.JobStatusFailed && job.def.Status.LastSavepointPath != nil {
|
||||
job.restore()
|
||||
return
|
||||
}
|
||||
if job.def.Status.JobStatus == v1alpha1.JobStatusFailed && job.def.Status.LastSavepointPath == nil {
|
||||
job.restore()
|
||||
return
|
||||
}
|
||||
// if job.def.Status.JobStatus == v1alpha1.JobStatusFailed && job.def.Status.LastSavepointPath != nil {
|
||||
// //job.restore()
|
||||
// return
|
||||
// }
|
||||
// if job.def.Status.JobStatus == v1alpha1.JobStatusFailed && job.def.Status.LastSavepointPath == nil {
|
||||
// //job.restore()
|
||||
// return
|
||||
// }
|
||||
|
||||
pkg.Logger.Warn("[managed-job] [cycle]", zap.String("unhanded job status", string(job.def.Status.JobStatus)))
|
||||
}
|
||||
|
||||
@ -1,7 +1,6 @@
|
||||
package managed_job
|
||||
|
||||
import (
|
||||
"errors"
|
||||
"flink-kube-operator/internal/crd/v1alpha1"
|
||||
"strings"
|
||||
"time"
|
||||
@ -12,8 +11,8 @@ import (
|
||||
"go.uber.org/zap"
|
||||
)
|
||||
|
||||
// restore the job from savepoint and jarId in managedJob
|
||||
func (job *ManagedJob) restore() error {
|
||||
// run the job from savepoint and jarId in managedJob
|
||||
func (job *ManagedJob) run() error {
|
||||
var savepointPath string
|
||||
if job.def.Status.LastSavepointPath == nil {
|
||||
pkg.Logger.Error("[managed-job] [restore]", zap.Error(v1alpha1.ErrNoSavepointPath))
|
||||
@ -21,39 +20,48 @@ func (job *ManagedJob) restore() error {
|
||||
} else {
|
||||
savepointPath = *job.def.Status.LastSavepointPath
|
||||
}
|
||||
if job.def.Status.JarId == nil {
|
||||
err := errors.New("missing jar id")
|
||||
pkg.Logger.Error("[managed-job] [restore]", zap.Error(err))
|
||||
return err
|
||||
}
|
||||
|
||||
pkg.Logger.Info("[managed-job] [restore] restoring job", zap.String("name", job.def.GetName()), zap.String("savepointPath", savepointPath))
|
||||
var jobId *string
|
||||
for {
|
||||
runJarResp, err := job.client.RunJar(api.RunOpts{
|
||||
JarID: *job.def.Status.JarId,
|
||||
AllowNonRestoredState: true,
|
||||
EntryClass: job.def.Spec.EntryClass,
|
||||
SavepointPath: savepointPath,
|
||||
})
|
||||
if err != nil {
|
||||
if strings.ContainsAny(err.Error(), ".jar does not exist") {
|
||||
err := job.upload()
|
||||
if err != nil {
|
||||
job.crd.Patch(job.def.UID, map[string]interface{}{
|
||||
"status": map[string]interface{}{
|
||||
"error": "[upload-error] " + err.Error(),
|
||||
},
|
||||
})
|
||||
return nil
|
||||
shouldUpload := false
|
||||
if job.def.Status.JarId == nil {
|
||||
err := v1alpha1.ErrNoJarId
|
||||
pkg.Logger.Error("[managed-job] [run]", zap.Error(err))
|
||||
shouldUpload = true
|
||||
} else {
|
||||
runJarResp, err := job.client.RunJar(api.RunOpts{
|
||||
JarID: *job.def.Status.JarId,
|
||||
AllowNonRestoredState: true,
|
||||
EntryClass: job.def.Spec.EntryClass,
|
||||
SavepointPath: savepointPath,
|
||||
})
|
||||
if err == nil {
|
||||
pkg.Logger.Info("[managed-job] [run] jar successfully ran", zap.Any("run-jar-resp", runJarResp))
|
||||
jobId = &runJarResp.JobId
|
||||
break
|
||||
} else {
|
||||
if strings.ContainsAny(err.Error(), ".jar does not exist") {
|
||||
shouldUpload = true
|
||||
} else {
|
||||
pkg.Logger.Error("[managed-job] [run] unhandled jar run Flink error", zap.Error(err))
|
||||
}
|
||||
continue
|
||||
}
|
||||
pkg.Logger.Error("[managed-job] [restore]", zap.Error(err))
|
||||
return err
|
||||
}
|
||||
jobId = &runJarResp.JobId
|
||||
pkg.Logger.Debug("[main] after run jar", zap.Any("run-jar-resp", runJarResp))
|
||||
break
|
||||
|
||||
if shouldUpload {
|
||||
err := job.upload()
|
||||
if err != nil {
|
||||
job.crd.Patch(job.def.UID, map[string]interface{}{
|
||||
"status": map[string]interface{}{
|
||||
"error": "[upload-error] " + err.Error(),
|
||||
},
|
||||
})
|
||||
return nil
|
||||
}
|
||||
continue
|
||||
}
|
||||
return nil
|
||||
}
|
||||
|
||||
// job.def.Status.JobId = &runJarResp.JobId
|
||||
@ -62,6 +70,7 @@ func (job *ManagedJob) restore() error {
|
||||
job.crd.Patch(job.def.UID, map[string]interface{}{
|
||||
"status": map[string]interface{}{
|
||||
"jobId": jobId,
|
||||
"runningJarURI": job.def.Spec.JarURI,
|
||||
"jobStatus": v1alpha1.JobStatusCreating,
|
||||
"lifeCycleStatus": v1alpha1.LifeCycleStatusRestoring,
|
||||
"lastRestoredSavepointDate": job.def.Status.LastSavepointDate,
|
||||
|
||||
28
internal/managed_job/upgrade.go
Normal file
28
internal/managed_job/upgrade.go
Normal file
@ -0,0 +1,28 @@
|
||||
package managed_job
|
||||
|
||||
import (
|
||||
"flink-kube-operator/internal/crd/v1alpha1"
|
||||
"flink-kube-operator/pkg"
|
||||
|
||||
"go.uber.org/zap"
|
||||
)
|
||||
|
||||
func (job *ManagedJob) upgrade() {
|
||||
if job.def.Status.LastSavepointPath != nil {
|
||||
pkg.Logger.Info("upgrading job ",
|
||||
zap.String("jobName", job.def.GetName()),
|
||||
zap.String("currentJarURI", job.def.Spec.JarURI),
|
||||
zap.String("prevJarURI", *job.def.Status.RunningJarURI),
|
||||
)
|
||||
job.run()
|
||||
} else {
|
||||
err := "There is no savepoint path existing"
|
||||
pkg.Logger.Error(err)
|
||||
job.crd.Patch(job.def.UID, map[string]interface{}{
|
||||
"status": map[string]interface{}{
|
||||
"lifeCycleStatus": v1alpha1.LifeCycleStatusUpgradeFailed,
|
||||
"error": err,
|
||||
},
|
||||
})
|
||||
}
|
||||
}
|
||||
36
internal/managed_job/upload.go
Normal file
36
internal/managed_job/upload.go
Normal file
@ -0,0 +1,36 @@
|
||||
package managed_job
|
||||
|
||||
import (
|
||||
"flink-kube-operator/internal/jar"
|
||||
|
||||
"flink-kube-operator/pkg"
|
||||
|
||||
"go.uber.org/zap"
|
||||
)
|
||||
|
||||
// upload jar file and set the jarId for later usages
|
||||
func (job *ManagedJob) upload() error {
|
||||
jarFile, err := jar.NewJarFile(job.def.Spec.JarURI)
|
||||
if err != nil {
|
||||
pkg.Logger.Debug("[main] error on download jar", zap.Error(err))
|
||||
return err
|
||||
}
|
||||
jarId, err := jarFile.Upload(job.client)
|
||||
if err != nil {
|
||||
pkg.Logger.Debug("[main] error on upload jar", zap.Error(err))
|
||||
return err
|
||||
}
|
||||
err = jarFile.Delete()
|
||||
if err != nil {
|
||||
pkg.Logger.Debug("[main] error on delete jar", zap.Error(err))
|
||||
}
|
||||
pkg.Logger.Debug("[main] after upload jar", zap.Any("upload-jar-resp", jarId))
|
||||
|
||||
job.def.Status.JarId = &jarId
|
||||
job.crd.Patch(job.def.UID, map[string]interface{}{
|
||||
"status": map[string]interface{}{
|
||||
"jarId": job.def.Status.JarId,
|
||||
},
|
||||
})
|
||||
return nil
|
||||
}
|
||||
Loading…
x
Reference in New Issue
Block a user