feat: handle upgrade job

parent 89702d287a
commit 9b219d967e
@@ -4,7 +4,7 @@ tmp_dir = "tmp"
 
 [build]
 args_bin = []
-bin = "./tmp/main"
+bin = ";set -o allexport && source ./.env && set +o allexport; ./tmp/main"
 cmd = "go build -o ./tmp/main ./cmd/operator"
 delay = 1000
 exclude_dir = ["assets", "tmp", "vendor", "testdata"]
@@ -1 +1,5 @@
 Tiltfile
+Dockerfile
+.env.example
+.git
+.gitignore
@@ -1 +1,2 @@
 FLINK_API_URL=127.0.0.1:8081
+SAVEPOINT_PATH=/opt/flink/savepoints
.gitignore (vendored)
@@ -2,3 +2,4 @@ db
 *.log
 tmp
 *.jar
+.env
.vscode/launch.json (vendored)
@@ -9,6 +9,10 @@
 "type": "go",
 "request": "launch",
 "mode": "auto",
+"env": {
+  "FLINK_API_URL": "127.0.0.1:8081",
+  "SAVEPOINT_PATH": "/opt/flink/savepoints"
+},
 "cwd": "${workspaceFolder}",
 "program": "${workspaceFolder}/cmd/operator"
 }
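Side note on the configuration changes above: the .env.example, launch.json, and watcher `bin` changes all exist to get FLINK_API_URL and SAVEPOINT_PATH into the operator's environment. How the operator actually consumes them is not shown in this diff; the following is only a minimal, hypothetical sketch of reading those two variables with os.Getenv.

package main

import (
	"fmt"
	"os"
)

func main() {
	// FLINK_API_URL and SAVEPOINT_PATH are the variable names introduced in this commit;
	// this loading code is illustrative, not the operator's real config path.
	apiURL := os.Getenv("FLINK_API_URL")         // e.g. "127.0.0.1:8081"
	savepointPath := os.Getenv("SAVEPOINT_PATH") // e.g. "/opt/flink/savepoints"

	if apiURL == "" || savepointPath == "" {
		fmt.Println("FLINK_API_URL and SAVEPOINT_PATH must be set (see .env.example)")
		os.Exit(1)
	}
	fmt.Println("Flink REST API:", apiURL, "| savepoint dir:", savepointPath)
}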
@@ -88,6 +88,8 @@ spec:
 lastRestoredSavepointRestoredDate:
   type: string
   format: time
+runningJarURI:
+  type: string
 restoredCount:
   type: number
 additionalPrinterColumns:
@@ -19,17 +19,18 @@ type FlinkJobSpec struct {
 }
 
 type FlinkJobStatus struct {
 	JobStatus JobStatus `json:"jobStatus,omitempty"`
-	LifeCycleStatus *string `json:"lifeCycleStatus,omitempty"`
+	LifeCycleStatus LifeCycleStatus `json:"lifeCycleStatus,omitempty"`
 	LastSavepointPath *string `json:"lastSavepointPath,omitempty"`
 	JarId *string `json:"jarId,omitempty"`
 	JobId *string `json:"jobId,omitempty"`
 	Error *string `json:"error,omitempty"`
 	SavepointTriggerId *string `json:"savepointTriggerId,omitempty"`
 	LastSavepointDate *time.Time `json:"lastSavepointDate,omitempty"`
 	LastRestoredSavepointDate *time.Time `json:"lastRestoredSavepointDate,omitempty"`
 	LastRestoredSavepointRestoredDate *time.Time `json:"lastRestoredSavepointRestoredDate,omitempty"`
 	RestoredCount int `json:"restoredCount,omitempty"`
+	RunningJarURI *string `json:"runningJarURI"`
 }
 
 // +k8s:deepcopy-gen:interfaces=k8s.io/apimachinery/pkg/runtime.Object
@@ -49,6 +50,7 @@ type FlinkJobList struct {
 
 var (
 	ErrNoJobId = errors.New("[managed-job] no job id")
+	ErrNoJarId = errors.New("[managed-job] no jar id")
 	ErrNoSavepointTriggerId = errors.New("[managed-job] no savepoint trigger id")
 	ErrNoSavepointPath = errors.New("[managed-job] no savepoint path")
 )
@@ -75,6 +77,7 @@ type LifeCycleStatus string
 const (
 	LifeCycleStatusInitializing LifeCycleStatus = "INITIALIZING"
 	LifeCycleStatusRestoring LifeCycleStatus = "RESTORING"
+	LifeCycleStatusUpgradeFailed LifeCycleStatus = "UPGRADE_FAILED"
 	LifeCycleStatusUnhealthyJobManager LifeCycleStatus = "UNHEALTHY_JOB_MANAGER"
 	LifeCycleStatusHealthy LifeCycleStatus = "HEALTHY"
 	LifeCycleStatusFailed LifeCycleStatus = "FAILED"
@@ -3,7 +3,6 @@ package crd
 import (
 	"context"
 	"flink-kube-operator/internal/crd/v1alpha1"
-	"fmt"
 
 	"flink-kube-operator/pkg"
 
@@ -51,8 +50,7 @@ func (crd Crd) watchFlinkJobs() rxgo.Observable {
 		switch event.Type {
 		case watch.Bookmark:
 		case watch.Modified:
-			pkg.Logger.Info("[crd] [watch] flink job updated")
-			fmt.Printf("FlinkJob updated: %s\n", job.GetName())
+			pkg.Logger.Info("[crd] [watch] flink job updated", zap.String("jobName", job.GetName()))
 			crd.repsert(job)
 		case watch.Added:
 			pkg.Logger.Info("[crd] [watch] new flink job created")
@@ -2,7 +2,6 @@ package managed_job
 
 import (
 	"flink-kube-operator/internal/crd/v1alpha1"
-	"strings"
 	"time"
 
 	"flink-kube-operator/pkg"
@@ -10,83 +9,14 @@ import (
 	"go.uber.org/zap"
 )
 
-// func (job *ManagedJob) startCycle() {
-// 	ticker := time.NewTicker(5 * time.Second)
-// 	quit := make(chan struct{})
-
-// 	// load job state from db
-// 	job.loadState()
-// 	go func() {
-// 		for {
-// 			select {
-// 			case <-ticker.C:
-// 				job.cycle()
-// 			case <-quit:
-// 				ticker.Stop()
-// 				return
-// 			}
-// 		}
-// 	}()
-// }
-
 func (job *ManagedJob) Cycle() {
 	pkg.Logger.Debug("[managed-job] [new] check cycle", zap.String("jobKey", string(job.def.UID)))
 
 	// Init job
-	if job.def.Status.JobStatus == "" {
-		if job.def.Status.LastSavepointPath == nil {
-			if job.def.Status.JarId == nil {
-				err := job.upload()
-				if err != nil {
-					job.crd.Patch(job.def.UID, map[string]interface{}{
-						"status": map[string]interface{}{
-							"error": "[upload-error] " + err.Error(),
-						},
-					})
-					return
-				}
-			}
-			for {
-				err := job.run()
-				if err != nil {
-					if strings.ContainsAny(err.Error(), ".jar does not exist") {
-						err := job.upload()
-						if err != nil {
-							job.crd.Patch(job.def.UID, map[string]interface{}{
-								"status": map[string]interface{}{
-									"error": "[upload-error] " + err.Error(),
-								},
-							})
-							return
-						}
-						continue
-					}
-					job.crd.Patch(job.def.UID, map[string]interface{}{
-						"status": map[string]interface{}{
-							"error": "[run-error] " + err.Error(),
-						},
-					})
-					return
-				}
-				return
-			}
-		} else {
-			job.restore()
-		}
+	if job.def.Status.LifeCycleStatus == "" && job.def.Status.JobStatus == "" {
+		job.run()
 		return
 	}
-	// job.crd.SetJobStatus(job.def.UID, v1alpha1.FlinkJobStatus{
-	// 	JobStatus: job.def.Status.JobStatus,
-	// })
-
-	// Check for set running or error state
-	/* if job.state.Status == JobStatusCreating || job.state.Status == JobStatusFailing {
-		err := job.checkStatus()
-		if errors.Is(err, ErrNoJobId) {
-			job.state = nil
-		}
-		return
-	} */
 
 	if job.def.Status.JobStatus == v1alpha1.JobStatusRunning {
 		if (job.def.Spec.SavepointInterval.Duration != 0) && ((job.def.Status.LastSavepointDate == nil) || time.Now().Add(-job.def.Spec.SavepointInterval.Duration).After(*job.def.Status.LastSavepointDate)) {
@@ -96,16 +26,20 @@ func (job *ManagedJob) Cycle() {
 				job.trackSavepoint()
 			}
 		}
+		if job.def.Status.RunningJarURI != nil && job.def.Spec.JarURI != *job.def.Status.RunningJarURI {
+			job.upgrade()
+		}
+
 		return
 	}
-	if job.def.Status.JobStatus == v1alpha1.JobStatusFailed && job.def.Status.LastSavepointPath != nil {
-		job.restore()
-		return
-	}
-	if job.def.Status.JobStatus == v1alpha1.JobStatusFailed && job.def.Status.LastSavepointPath == nil {
-		job.restore()
-		return
-	}
+	// if job.def.Status.JobStatus == v1alpha1.JobStatusFailed && job.def.Status.LastSavepointPath != nil {
+	// 	//job.restore()
+	// 	return
+	// }
+	// if job.def.Status.JobStatus == v1alpha1.JobStatusFailed && job.def.Status.LastSavepointPath == nil {
+	// 	//job.restore()
+	// 	return
+	// }
 
 	pkg.Logger.Warn("[managed-job] [cycle]", zap.String("unhanded job status", string(job.def.Status.JobStatus)))
 }
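A reading aid for the savepoint condition carried as context in the Cycle hunks above: time.Now().Add(-interval).After(*lastSavepointDate) is just "the last savepoint is older than the configured interval". The following is a small, hypothetical helper (not part of the commit) showing the equivalent check.

package main

import (
	"fmt"
	"time"
)

// savepointDue mirrors the Cycle condition: a savepoint is due when an interval is
// configured and either no savepoint has been taken yet or the last one is older than
// that interval. now.Add(-interval).After(*last) is equivalent to now.Sub(*last) > interval.
func savepointDue(last *time.Time, interval time.Duration, now time.Time) bool {
	if interval == 0 {
		return false // periodic savepointing disabled
	}
	return last == nil || now.Sub(*last) > interval
}

func main() {
	now := time.Now()
	old := now.Add(-10 * time.Minute)
	fmt.Println(savepointDue(nil, 5*time.Minute, now))  // true: never savepointed
	fmt.Println(savepointDue(&old, 5*time.Minute, now)) // true: last one is 10m old
	fmt.Println(savepointDue(&old, time.Hour, now))     // false: still fresh
}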
@@ -1,7 +1,6 @@
 package managed_job
 
 import (
-	"errors"
 	"flink-kube-operator/internal/crd/v1alpha1"
 	"strings"
 	"time"
@@ -12,8 +11,8 @@ import (
 	"go.uber.org/zap"
 )
 
-// restore the job from savepoint and jarId in managedJob
-func (job *ManagedJob) restore() error {
+// run the job from savepoint and jarId in managedJob
+func (job *ManagedJob) run() error {
 	var savepointPath string
 	if job.def.Status.LastSavepointPath == nil {
 		pkg.Logger.Error("[managed-job] [restore]", zap.Error(v1alpha1.ErrNoSavepointPath))
@@ -21,39 +20,48 @@ func (job *ManagedJob) restore() error {
 	} else {
 		savepointPath = *job.def.Status.LastSavepointPath
 	}
-	if job.def.Status.JarId == nil {
-		err := errors.New("missing jar id")
-		pkg.Logger.Error("[managed-job] [restore]", zap.Error(err))
-		return err
-	}
 	pkg.Logger.Info("[managed-job] [restore] restoring job", zap.String("name", job.def.GetName()), zap.String("savepointPath", savepointPath))
 	var jobId *string
 	for {
-		runJarResp, err := job.client.RunJar(api.RunOpts{
-			JarID:                 *job.def.Status.JarId,
-			AllowNonRestoredState: true,
-			EntryClass:            job.def.Spec.EntryClass,
-			SavepointPath:         savepointPath,
-		})
-		if err != nil {
-			if strings.ContainsAny(err.Error(), ".jar does not exist") {
-				err := job.upload()
-				if err != nil {
-					job.crd.Patch(job.def.UID, map[string]interface{}{
-						"status": map[string]interface{}{
-							"error": "[upload-error] " + err.Error(),
-						},
-					})
-					return nil
+		shouldUpload := false
+		if job.def.Status.JarId == nil {
+			err := v1alpha1.ErrNoJarId
+			pkg.Logger.Error("[managed-job] [run]", zap.Error(err))
+			shouldUpload = true
+		} else {
+			runJarResp, err := job.client.RunJar(api.RunOpts{
+				JarID:                 *job.def.Status.JarId,
+				AllowNonRestoredState: true,
+				EntryClass:            job.def.Spec.EntryClass,
+				SavepointPath:         savepointPath,
+			})
+			if err == nil {
+				pkg.Logger.Info("[managed-job] [run] jar successfully ran", zap.Any("run-jar-resp", runJarResp))
+				jobId = &runJarResp.JobId
+				break
+			} else {
+				if strings.ContainsAny(err.Error(), ".jar does not exist") {
+					shouldUpload = true
+				} else {
+					pkg.Logger.Error("[managed-job] [run] unhandled jar run Flink error", zap.Error(err))
 				}
-				continue
 			}
-			pkg.Logger.Error("[managed-job] [restore]", zap.Error(err))
-			return err
 		}
-		jobId = &runJarResp.JobId
-		pkg.Logger.Debug("[main] after run jar", zap.Any("run-jar-resp", runJarResp))
-		break
+		if shouldUpload {
+			err := job.upload()
+			if err != nil {
+				job.crd.Patch(job.def.UID, map[string]interface{}{
+					"status": map[string]interface{}{
+						"error": "[upload-error] " + err.Error(),
+					},
+				})
+				return nil
+			}
+			continue
+		}
+		return nil
 	}
 
 	// job.def.Status.JobId = &runJarResp.JobId
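One detail in the retry loop above worth flagging: strings.ContainsAny reports whether any single character of its second argument occurs in the string, not whether the substring occurs, so the ".jar does not exist" check matches almost any error text; strings.Contains is the whole-substring test this branch presumably intends. A small illustration (not part of the commit):

package main

import (
	"fmt"
	"strings"
)

func main() {
	missingJar := "jar file /opt/flink/app.jar does not exist"
	unrelated := "connection refused"

	// ContainsAny matches if ANY rune from ".jar does not exist" appears,
	// so even an unrelated error satisfies the check.
	fmt.Println(strings.ContainsAny(unrelated, ".jar does not exist")) // true

	// Contains is the substring test the retry-and-reupload branch likely wants.
	fmt.Println(strings.Contains(missingJar, ".jar does not exist")) // true
	fmt.Println(strings.Contains(unrelated, ".jar does not exist"))  // false
}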
@@ -62,6 +70,7 @@ func (job *ManagedJob) restore() error {
 	job.crd.Patch(job.def.UID, map[string]interface{}{
 		"status": map[string]interface{}{
 			"jobId": jobId,
+			"runningJarURI": job.def.Spec.JarURI,
 			"jobStatus": v1alpha1.JobStatusCreating,
 			"lifeCycleStatus": v1alpha1.LifeCycleStatusRestoring,
 			"lastRestoredSavepointDate": job.def.Status.LastSavepointDate,
internal/managed_job/upgrade.go (new file, 28 lines)
@@ -0,0 +1,28 @@
+package managed_job
+
+import (
+	"flink-kube-operator/internal/crd/v1alpha1"
+	"flink-kube-operator/pkg"
+
+	"go.uber.org/zap"
+)
+
+func (job *ManagedJob) upgrade() {
+	if job.def.Status.LastSavepointPath != nil {
+		pkg.Logger.Info("upgrading job ",
+			zap.String("jobName", job.def.GetName()),
+			zap.String("currentJarURI", job.def.Spec.JarURI),
+			zap.String("prevJarURI", *job.def.Status.RunningJarURI),
+		)
+		job.run()
+	} else {
+		err := "There is no savepoint path existing"
+		pkg.Logger.Error(err)
+		job.crd.Patch(job.def.UID, map[string]interface{}{
+			"status": map[string]interface{}{
+				"lifeCycleStatus": v1alpha1.LifeCycleStatusUpgradeFailed,
+				"error": err,
+			},
+		})
+	}
+}
internal/managed_job/upload.go (new file, 36 lines)
@@ -0,0 +1,36 @@
+package managed_job
+
+import (
+	"flink-kube-operator/internal/jar"
+
+	"flink-kube-operator/pkg"
+
+	"go.uber.org/zap"
+)
+
+// upload jar file and set the jarId for later usages
+func (job *ManagedJob) upload() error {
+	jarFile, err := jar.NewJarFile(job.def.Spec.JarURI)
+	if err != nil {
+		pkg.Logger.Debug("[main] error on download jar", zap.Error(err))
+		return err
+	}
+	jarId, err := jarFile.Upload(job.client)
+	if err != nil {
+		pkg.Logger.Debug("[main] error on upload jar", zap.Error(err))
+		return err
+	}
+	err = jarFile.Delete()
+	if err != nil {
+		pkg.Logger.Debug("[main] error on delete jar", zap.Error(err))
+	}
+	pkg.Logger.Debug("[main] after upload jar", zap.Any("upload-jar-resp", jarId))
+
+	job.def.Status.JarId = &jarId
+	job.crd.Patch(job.def.UID, map[string]interface{}{
+		"status": map[string]interface{}{
+			"jarId": job.def.Status.JarId,
+		},
+	})
+	return nil
+}