|
| 1 | +package org.thp.cortex.services |
| 2 | + |
| 3 | +import java.util.concurrent.TimeUnit |
| 4 | +import java.nio.file._ |
| 5 | + |
| 6 | +import scala.concurrent.duration.FiniteDuration |
| 7 | +import scala.concurrent.{ExecutionContext, Future} |
| 8 | +import scala.util.{Try, Success, Failure} |
| 9 | +import scala.collection.JavaConverters._ |
| 10 | + |
| 11 | +import play.api.{Configuration, Logger} |
| 12 | + |
| 13 | +import akka.actor.ActorSystem |
| 14 | +import io.fabric8.kubernetes.client.{DefaultKubernetesClient} |
| 15 | +import io.fabric8.kubernetes.api.model.batch.{JobBuilder => KJobBuilder} |
| 16 | +import io.fabric8.kubernetes.api.model.{PersistentVolumeClaimVolumeSourceBuilder} |
| 17 | +import javax.inject.{Inject, Singleton} |
| 18 | +import org.thp.cortex.models._ |
| 19 | + |
| 20 | +@Singleton |
| 21 | +class K8sJobRunnerSrv( |
| 22 | + client: DefaultKubernetesClient, |
| 23 | + autoUpdate: Boolean, |
| 24 | + jobBaseDirectory: Path, |
| 25 | + persistentVolumeClaimName: String, |
| 26 | + implicit val system: ActorSystem |
| 27 | +) { |
| 28 | + |
| 29 | + @Inject() |
| 30 | + def this(config: Configuration, system: ActorSystem) = |
| 31 | + this( |
| 32 | + new DefaultKubernetesClient(), |
| 33 | + config.getOptional[Boolean]("job.kubernetes.autoUpdate").getOrElse(true), |
| 34 | + Paths.get(config.get[String]("job.directory")), |
| 35 | + config.get[String]("job.kubernetes.persistentVolumeClaimName"), |
| 36 | + system: ActorSystem |
| 37 | + ) |
| 38 | + |
| 39 | + lazy val logger = Logger(getClass) |
| 40 | + |
| 41 | + lazy val isAvailable: Boolean = |
| 42 | + Try { |
| 43 | + val ver = client.getVersion() |
| 44 | + logger.info(s"Kubernetes is available: major ${ver.getMajor()} minor ${ver.getMinor()} git ${ver.getGitCommit()}") |
| 45 | + true |
| 46 | + }.recover { |
| 47 | + case error => |
| 48 | + logger.info(s"Kubernetes is not available", error) |
| 49 | + false |
| 50 | + }.get |
| 51 | + |
| 52 | + def run(jobDirectory: Path, dockerImage: String, job: Job, timeout: Option[FiniteDuration])(implicit ec: ExecutionContext): Future[Unit] = { |
| 53 | + val cacertsFile = jobDirectory.resolve("input").resolve("cacerts") |
| 54 | + val relativeJobDirectory = jobBaseDirectory.relativize(jobDirectory).toString() |
| 55 | + // make the default longer than likely values, but still not infinite |
| 56 | + val timeout_or_default = timeout getOrElse new FiniteDuration(8, TimeUnit.HOURS) |
| 57 | + // https://kubernetes.io/docs/concepts/overview/working-with-objects/names/ |
| 58 | + // FIXME: this collapses case, jeopardizing the uniqueness of the |
| 59 | + // identifier. LDH: lowercase, digits, hyphens. |
| 60 | + val ldh_jobid = "_".r.replaceAllIn(job.id.map(_.toLower), "-") |
| 61 | + val kjobName = "neuron-job-" + ldh_jobid |
| 62 | + val pvcvs = new PersistentVolumeClaimVolumeSourceBuilder() |
| 63 | + .withClaimName(persistentVolumeClaimName) |
| 64 | + .withReadOnly(false) |
| 65 | + .build(); |
| 66 | + val kjob1 = new KJobBuilder() |
| 67 | + .withApiVersion("batch/v1") |
| 68 | + .withNewMetadata() |
| 69 | + .withName(kjobName) |
| 70 | + .withLabels(Map( |
| 71 | + "cortex-job-id" -> job.id, |
| 72 | + "cortex-worker-id" -> job.workerId(), |
| 73 | + "cortex-neuron-job" -> "true").asJava) |
| 74 | + .endMetadata() |
| 75 | + .withNewSpec() |
| 76 | + .withNewTemplate() |
| 77 | + .withNewSpec() |
| 78 | + .addNewVolume() |
| 79 | + .withName("job-directory") |
| 80 | + .withPersistentVolumeClaim(pvcvs) |
| 81 | + .endVolume() |
| 82 | + .addNewContainer() |
| 83 | + .withName("neuron") |
| 84 | + .withImage(dockerImage) |
| 85 | + .withArgs("/job") |
| 86 | + .addNewEnv() |
| 87 | + .withName("CORTEX_JOB_FOLDER") |
| 88 | + .withValue(relativeJobDirectory) |
| 89 | + .endEnv(); |
| 90 | + val kjob2 = if (Files.exists(cacertsFile)) { |
| 91 | + kjob1.addNewEnv() |
| 92 | + .withName("REQUESTS_CA_BUNDLE") |
| 93 | + .withValue("/job/input/cacerts") |
| 94 | + .endEnv() |
| 95 | + } else { |
| 96 | + kjob1 |
| 97 | + } |
| 98 | + val kjob3 = kjob2 |
| 99 | + .addNewVolumeMount() |
| 100 | + .withName("job-directory") |
| 101 | + .withSubPathExpr("$(CORTEX_JOB_FOLDER)/input") |
| 102 | + .withMountPath("/job/input") |
| 103 | + .withReadOnly(true) |
| 104 | + .endVolumeMount() |
| 105 | + .addNewVolumeMount() |
| 106 | + .withName("job-directory") |
| 107 | + .withSubPathExpr("$(CORTEX_JOB_FOLDER)/output") |
| 108 | + .withMountPath("/job/output") |
| 109 | + .withReadOnly(false) |
| 110 | + .endVolumeMount() |
| 111 | + .endContainer() |
| 112 | + .withRestartPolicy("Never") |
| 113 | + .endSpec() |
| 114 | + .endTemplate() |
| 115 | + .endSpec() |
| 116 | + .build(); |
| 117 | + |
| 118 | + val execution = Future { |
| 119 | + val created_kjob = client.batch().jobs().create(kjob3) |
| 120 | + val created_env = created_kjob |
| 121 | + .getSpec().getTemplate().getSpec().getContainers().get(0) |
| 122 | + .getEnv().asScala; |
| 123 | + logger.info( |
| 124 | + s"Created Kubernetes Job ${created_kjob.getMetadata().getName()}\n" + |
| 125 | + s" timeout: ${timeout_or_default.toString}\n" + |
| 126 | + s" image : $dockerImage\n" + |
| 127 | + s" mount : pvc ${persistentVolumeClaimName} subdir ${relativeJobDirectory} as /job" + |
| 128 | + created_env.map(ev => s"\n env : ${ev.getName()} = ${ev.getValue()}").mkString) |
| 129 | + val ended_kjob = client.batch().jobs().withLabel("cortex-job-id", job.id) |
| 130 | + .waitUntilCondition((x => Option(x).flatMap(j => |
| 131 | + Option(j.getStatus).flatMap(s => |
| 132 | + Some(s.getConditions.asScala.map(_.getType).filter(t => |
| 133 | + t.equals("Complete") || t.equals("Failed")).nonEmpty))) |
| 134 | + getOrElse false), |
| 135 | + timeout_or_default.length, timeout_or_default.unit); |
| 136 | + if(ended_kjob != null) { |
| 137 | + logger.info(s"Kubernetes Job ${ended_kjob.getMetadata().getName()} " + |
| 138 | + s"(for job ${job.id}) status is now ${ended_kjob.getStatus().toString()}") |
| 139 | + } else { |
| 140 | + logger.info(s"Kubernetes Job for ${job.id} no longer exists") |
| 141 | + } |
| 142 | + }.andThen { |
| 143 | + // let's find the job by the attribute we know is fundamentally |
| 144 | + // unique, rather than one constructed from it |
| 145 | + case Success(r) => |
| 146 | + val deleted = client.batch().jobs().withLabel("cortex-job-id", job.id).delete() |
| 147 | + if(deleted) { |
| 148 | + logger.info(s"Deleted Kubernetes Job for job ${job.id}") |
| 149 | + } else { |
| 150 | + logger.info(s"While trying to delete Kubernetes Job for ${job.id}, the job was not found; this is OK") |
| 151 | + } |
| 152 | + Future {} |
| 153 | + case Failure(t) => |
| 154 | + logger.warn(s"Some problem happened; not deleting Kubernetes Job for job ${job.id}") |
| 155 | + Future {} |
| 156 | + } |
| 157 | + execution |
| 158 | + } |
| 159 | +} |
0 commit comments