public final class K8sWorkerUtils
extends java.lang.Object
Modifier and Type | Method and Description |
---|---|
static int |
calculateWorkerID(JobAPI.Job job,
java.lang.String podName,
java.lang.String containerName)
calculate the workerID from the given parameters
|
static int |
calculateWorkerIDInSS(int podIndex,
int containerIndex,
int workersPerPod)
calculate the workerID in the given StatefulSet
|
static int |
countWorkersUpToSS(JobAPI.Job job,
int currentStatefulSetIndex)
calculate the number of workers in the earlier statefulsets
|
static java.util.Map<java.lang.String,java.lang.Integer> |
generateAdditionalPorts(Config config,
int workerPort)
generate the additional requested ports for this worker
|
static int |
getAndInitRestartCount(Config cnfg,
java.lang.String jbID,
JobMasterAPI.WorkerInfo wInfo) |
static JobAPI.ComputeResource |
getComputeResource(JobAPI.Job job,
java.lang.String podName) |
static java.lang.String |
getJobMasterServiceIP(java.lang.String namespace,
java.lang.String jobID)
get job master service IP from job master service name
|
static java.lang.String |
getJobMasterServiceIPByPolling(java.lang.String namespace,
java.lang.String jobID,
long timeLimitMS)
Get the job master service IP from the job master service name.
Polls repeatedly until the IP is obtained or the time limit is reached.
|
static JobMasterAPI.NodeInfo |
getNodeInfoFromEncodedStr(java.lang.String encodedNodeInfoList,
java.lang.String nodeIP) |
static void |
initLogger(Config cnfg,
java.lang.String entityName)
initialize the logger
entityName can be "jobMaster", "mpiMaster", etc.
|
static int |
initRestartFromCM(KubernetesController controller,
java.lang.String jbID,
java.lang.String keyName)
Get the restartCount from a ConfigMap in the K8s master.
If the worker/jm is starting for the first time,
we need to add the restart count key to the ConfigMap;
otherwise, we need to increase the restart count in the ConfigMap.
|
static int |
initRestartFromZK(Config cnfg,
java.lang.String jbID,
JobMasterAPI.WorkerInfo wInfo,
long jstTime)
The worker is either starting for the first time or restarting after a failure.
Returns the restartCount.
|
static void |
initWorkerLogger(int workerID,
K8sPersistentVolume pv,
Config cnfg)
initialize the logger
|
static Config |
loadConfig(java.lang.String configDir)
load configuration files from the given directory
|
static void |
waitIndefinitely()
a test method to make the worker wait indefinitely
|
public static Config loadConfig(java.lang.String configDir)
public static void initWorkerLogger(int workerID, K8sPersistentVolume pv, Config cnfg)
public static void initLogger(Config cnfg, java.lang.String entityName)
public static JobAPI.ComputeResource getComputeResource(JobAPI.Job job, java.lang.String podName)
public static int calculateWorkerID(JobAPI.Job job, java.lang.String podName, java.lang.String containerName)
public static int countWorkersUpToSS(JobAPI.Job job, int currentStatefulSetIndex)
public static int calculateWorkerIDInSS(int podIndex, int containerIndex, int workersPerPod)
public static JobMasterAPI.NodeInfo getNodeInfoFromEncodedStr(java.lang.String encodedNodeInfoList, java.lang.String nodeIP)
public static java.lang.String getJobMasterServiceIP(java.lang.String namespace, java.lang.String jobID)
public static java.lang.String getJobMasterServiceIPByPolling(java.lang.String namespace, java.lang.String jobID, long timeLimitMS)
public static java.util.Map<java.lang.String,java.lang.Integer> generateAdditionalPorts(Config config, int workerPort)
public static void waitIndefinitely()
public static int getAndInitRestartCount(Config cnfg, java.lang.String jbID, JobMasterAPI.WorkerInfo wInfo)
public static int initRestartFromZK(Config cnfg, java.lang.String jbID, JobMasterAPI.WorkerInfo wInfo, long jstTime)
public static int initRestartFromCM(KubernetesController controller, java.lang.String jbID, java.lang.String keyName)
restartCount is returned as zero if the worker/jm is starting for the first time; if it is greater than zero, the worker is restarting.