import os
import sys
import toml
import time
import ujson
import asyncio
import threading
from cachetools import cached, Cache
from kubernetes.client.rest import ApiException
from base_model.training_task import TrainingTask
from conf import CONF
from conf.flags import EXP_STATUS
from conf.flags import QUE_STATUS, TASK_TYPE
from db import MarsDB
from db import redis_conn
from k8s import K8sPreStopHook, get_corev1_api, get_appsv1_api
from k8s.v1_api import client
from k8s.v1_api import get_env_var
from logm import logger, log_stage
from k8s.async_v1_api import async_get_nodes_df
from roman_parliament import register_parliament, add_archive_trigger, archive_dict, add_archive_for_senators
from roman_parliament.archive_triggers.launcher_task_trigger import LauncherTaskTrigger
from server_model.auto_task_impl import AutoTaskSchemaWithDbImpl
from server_model.pod import Pod
from server_model.selector import TrainingTaskSelector, TrainImageSelector, UserSelector
register_parliament()
k8s_corev1_api = get_corev1_api()
k8s_appsv1_api = get_appsv1_api()
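# Extra host-path mounts for the manager pod. Each entry under launcher.manager_mounts is
# parsed as "host_path:mount_path[:ro]". A hypothetical config sketch (key names are
# illustrative only):
#   [launcher.manager_mounts]
#   weka = "/mnt/weka:/mnt/weka:ro"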
manager_mount_name = []
manager_host_path = []
manager_mount_path = []
manager_mount_ro = []
if manager_mounts := CONF.try_get('launcher.manager_mounts'):
for k, v in manager_mounts.items():
manager_mount_name.append(k)
manager_host_path.append(v.split(':')[0])
manager_mount_path.append(v.split(':')[1])
manager_mount_ro.append(v.split(':')[-1] == 'ro')
RETRY_TIMES = 2
module = os.environ.get('POD_NAME', 'launcher')
class DuplicatedPods(Exception):
    """Pods for this task have already been inserted by another launcher."""
    pass
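# Resource profile for the manager containers: the main manager container gets the larger
# memory allocation, the init container the smaller one.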
def get_manager_resource(big):
return client.V1ResourceRequirements(limits={'cpu': 1, 'memory': '4000Mi' if big else '200Mi'},
requests={'cpu': 0, 'memory': '1000Mi' if big else '100Mi'})
# image info does not change back and forth, so it is safe to cache
@cached(cache=Cache(maxsize=1024))
def get_image_info(image_name):
    # query the database here for the image info; the benefit is that this can later be integrated with the parliament
return TrainImageSelector.find_one(os.path.basename(image_name))
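# Cache of node metadata (flag and schedule_zone per node name), refreshed periodically by
# a background thread and protected by `lock`.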
nodes_dict = {}
lock = threading.Lock()
def start_get_nodes_df():
global nodes_dict
interval = 60 * 60
last_get = 0
loop = asyncio.new_event_loop()
while True:
try:
if time.time() - last_get > interval:
nodes_df = loop.run_until_complete(async_get_nodes_df())
_nodes_dict = {
node: {'flag': flag, 'schedule_zone': schedule_zone}
for node, flag, schedule_zone in zip(nodes_df.name, nodes_df.flag, nodes_df.schedule_zone)
}
                if len(_nodes_dict) > 0:
                    with lock:
                        nodes_dict = {**nodes_dict, **_nodes_dict}
last_get = time.time()
except Exception as e:
logger.error(e)
time.sleep(10)
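# Persist one Pod row per assigned node of the task (job 0 acts as master). A duplicate-key
# error means another launcher has already handled this task.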
@log_stage(module)
def insert_pods(task: TrainingTask):
if task.task_type != TASK_TYPE.VIRTUAL_TASK:
task_memory = task.config_json['assigned_resource']['memory']
task_cpu = task.config_json['assigned_resource']['cpu']
task_assigned_gpus = task.config_json['assigned_resource']['assigned_gpus']
try:
with MarsDB() as conn:
for i, node in enumerate(task.assigned_nodes):
Pod(
task_id=task.id, pod_id=f'{task.user_name.replace("_", "-")}-{task.id}-{i}', job_id=i,
xp_id=task.id,
status=EXP_STATUS.CREATED, node=node, role=['worker', 'master'][i == 0], memory=task_memory[i],
cpu=task_cpu[i], assigned_gpus=task_assigned_gpus[i]
).insert(db_conn=conn)
    except Exception as e:
        if 'duplicate' in str(e):
            logger.exception(e)
            raise DuplicatedPods('another launcher has already inserted pods for this task')
        raise Exception(f'task_id: {task.id} failed to insert pods, args: {(task_memory, task_cpu, task_assigned_gpus)} exception: {e}')
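# Create the per-task manager: a single-replica StatefulSet (init container + manager
# container) pinned to the configured manager nodes, plus a headless Service and the
# ConfigMaps it mounts; all of these set owner_references to the StatefulSet so they are
# cleaned up together.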
@log_stage(module)
def create_manager(task: TrainingTask, user_name):
task_id = task.id
namespace = task.user.config.task_namespace
    # build the manager name first; the ConfigMaps it mounts are created further below
manager_name = f'{user_name.replace("_", "-")}-{task_id}-manager'
    with lock:
        try:
            nodes_flags = [str(nodes_dict[n]['flag']) for n in task.assigned_nodes]
            nodes_zones = [str(nodes_dict[n]['schedule_zone']) for n in task.assigned_nodes]
        except Exception as e:
            logger.error('failed to get a valid nodes_df')
            logger.error(e)
            raise e
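    # environment shared by the init container and the manager container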
env = [
get_env_var(key='TASK_ID', value=task_id),
get_env_var(key='MANAGER_NAME', value=manager_name),
get_env_var(key='NAMESPACE', value=namespace),
get_env_var(key='TZ', value='Asia/Shanghai'),
get_env_var(key='MARSV2_SERVER', value=os.environ.get('MARSV2_SERVER', CONF.try_get('launcher.api_server'))),
get_env_var(key='DEBUG', value=os.environ.get('DEBUG', '0')),
get_env_var(key='MODULE_NAME', value='manager'),
get_env_var(key='MARSV2_SCHEDULE_ZONES', value=','.join(nodes_zones)),
get_env_var(key='MARSV2_NODE_FLAGS', value=','.join(nodes_flags)),
get_env_var(key='MARSV2_TASK_SIDECARS', value=ujson.dumps(task.schema.get('options', {}).get('sidecar', [])))
]
if 'CUSTOM_FILE_NAME' in os.environ:
env.append(get_env_var(key='CUSTOM_FILE_NAME', value=os.environ.get('CUSTOM_FILE_NAME', '')))
if manager_envs := CONF.try_get('launcher.manager_envs'):
for k, v in manager_envs.items():
env.append(get_env_var(k, value=v))
# hfai image
if task.backend.startswith('train_image:'):
train_image_info = get_image_info(task.backend[len('train_image:'):])
env.append(get_env_var(key='HFAI_IMAGE', value=train_image_info.image_url))
env.append(get_env_var(key='HFAI_IMAGE_WEKA_PATH', value=train_image_info.path))
manager_docker_image = os.environ.get('CURRENT_POD_IMAGE', CONF.try_get('launcher.manager_image'))
capabilities = client.V1Capabilities(add=['IPC_LOCK'])
security_context = client.V1SecurityContext(capabilities=capabilities)
volume_mounts = [
client.V1VolumeMount(
name=mount_name,
mount_path=mount_item,
read_only=ro)
for mount_name, mount_item, ro in zip(manager_mount_name, manager_mount_path, manager_mount_ro)
]
volumes = [
client.V1Volume(
name=mount_name,
host_path=client.V1HostPathVolumeSource(path=mount_item))
for mount_name, mount_item in zip(manager_mount_name, manager_host_path)
]
    # mount the config map and the log directory into the manager
volume_mounts += [
client.V1VolumeMount(
name='config-map',
mount_path='/etc/config'
),
client.V1VolumeMount(
name='log',
mount_path='/var/log/experiment-manager',
),
]
volumes += [
client.V1Volume(
name='config-map',
config_map=client.V1ConfigMapVolumeSource(
name=f'etc-configmap-{task_id}'
)
),
client.V1Volume(
name='log',
host_path=client.V1HostPathVolumeSource(
path=f'/var/log/experiment-manager/{user_name.replace("_", "-")}-{task_id}-manager-0',
type='DirectoryOrCreate'
)
),
]
    if os.environ.get('server_path'):  # do not use the server code baked into the image
volume_mounts += [
client.V1VolumeMount(
name='server',
mount_path='/high-flyer/code/multi_gpu_runner_server',
read_only=True
)]
volumes += [
client.V1Volume(
name='server',
host_path=client.V1HostPathVolumeSource(
path=os.environ['server_path'],
type='Directory'
)
)]
containers = [client.V1Container(name=container.replace('_', '-'),
image=manager_docker_image,
image_pull_policy=CONF.try_get('launcher.image_pull_policy', default='IfNotPresent'),
security_context=security_context,
env=env,
volume_mounts=volume_mounts,
resources=get_manager_resource('init' not in container))
for container in ['init_manager', 'manager']]
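    # the init container bootstraps the task via init_manager.py; the main container runs
    # the manager processes under supervisord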
init_command = ['/bin/bash', '-c']
init_args = ' && '.join([
"cd /high-flyer/code/multi_gpu_runner_server",
"PYTHONPATH=/high-flyer/code/multi_gpu_runner_server python -u experiment_manager/manager/init_manager.py"])
init_command.append(init_args)
containers[0].command = init_command
containers[1].command = ['supervisord', '-c', '/high-flyer/code/multi_gpu_runner_server/experiment_manager/supervisord.conf']
labels = {
'task_id': str(task_id),
'user_id': user_name,
'type': 'manager'
}
podspec = client.V1PodSpec(
init_containers=containers[0:1],
containers=containers[1:],
volumes=volumes,
service_account_name='default',
restart_policy='Always',
affinity=client.V1Affinity(
node_affinity=client.V1NodeAffinity(
required_during_scheduling_ignored_during_execution=client.V1NodeSelector(
node_selector_terms=[client.V1NodeSelectorTerm(
match_expressions=[
client.V1NodeSelectorRequirement(
key='kubernetes.io/hostname',
operator='In',
values=CONF.launcher.manager_nodes
)
]
)]
)
)
))
metadata = client.V1ObjectMeta(name=manager_name, namespace=namespace, labels=labels)
podtemplatespec = client.V1PodTemplateSpec(metadata=metadata,
spec=podspec)
stspec = client.V1StatefulSetSpec(
replicas=1,
template=podtemplatespec,
selector=client.V1LabelSelector(match_labels=labels),
service_name=f'{user_name.replace("_", "-")}-{task_id}-manager'
)
st = client.V1StatefulSet(metadata=metadata, spec=stspec)
st_resp = k8s_appsv1_api.create_namespaced_stateful_set_with_retry(namespace=namespace, body=st)
    # from here on, every resource's owner_ref points to the manager
owner_ref = client.V1OwnerReference(api_version='apps/v1', kind='StatefulSet', name=st_resp.metadata.name, uid=st_resp.metadata.uid, controller=False, block_owner_deletion=True)
    # create the headless service
metadata = client.V1ObjectMeta(
name=f'{user_name.replace("_", "-")}-{task_id}-manager-0',
namespace=namespace,
owner_references=[owner_ref]
)
spec = client.V1ServiceSpec(selector={'statefulset.kubernetes.io/pod-name': f'{user_name.replace("_", "-")}-{task_id}-manager-0'}, cluster_ip='None')
service = client.V1Service(api_version='v1', kind='Service', metadata=metadata, spec=spec)
k8s_corev1_api.create_namespaced_service_with_retry(namespace=namespace, body=service)
    # create the ConfigMaps the task needs; the manager is created first so that every resource's owner_ref can point to it
k8s_corev1_api.create_namespaced_config_map_with_retry(
namespace=namespace,
body=client.V1ConfigMap(
immutable=True,
data={'override.toml': toml.dumps(CONF)},
metadata=client.V1ObjectMeta(
name=f'etc-configmap-{task_id}',
namespace=namespace,
owner_references=[owner_ref]
)
)
)
k8s_corev1_api.create_namespaced_config_map_with_retry(
namespace=namespace,
body=client.V1ConfigMap(
immutable=True,
data={
file: open(os.path.join('marsv2/scripts', file), 'r').read()
for file in os.listdir('marsv2/scripts') if os.path.isfile(os.path.join('marsv2/scripts', file))
},
metadata=client.V1ObjectMeta(
                name=f'marsv2-scripts-{task_id}',  # unlike other resources, the user name is not added here because the storage table does not support replacing strings
namespace=namespace,
owner_references=[owner_ref]
)
)
)
k8s_corev1_api.create_namespaced_config_map_with_retry(
namespace=namespace,
body=client.V1ConfigMap(
immutable=True,
data={
file: open(os.path.join('marsv2/entrypoints', file), 'r').read()
for file in os.listdir('marsv2/entrypoints') if os.path.isfile(os.path.join('marsv2/entrypoints', file))
},
metadata=client.V1ObjectMeta(
name=f'marsv2-entrypoints-{task_id}',
namespace=namespace,
owner_references=[owner_ref]
)
)
)
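# Mark every pod of the task as STOPPED and the task itself as FINISHED, bypassing the
# normal manager shutdown path.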
def manual_make_task_finished(task: TrainingTask):
for pod in task.pods:
pod.update(('status', ), (EXP_STATUS.STOPPED, ))
task.re_impl(AutoTaskSchemaWithDbImpl)
task.update(('queue_status',), (QUE_STATUS.FINISHED,))
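# Launch one experiment: skip it if a ban flag was set for its chain (the originating task
# was stopped), otherwise insert the pod rows and create the manager, retrying up to
# RETRY_TIMES times on API errors.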
@log_stage(module)
def start_exp(task: TrainingTask):
logger.info(f"收到消息,起 {task.job_info} 的节点")
# 对于validation任务,如果最开始的虚拟任务停止,则对应的所有validation任务停止
main_task = TrainingTaskSelector.find_one(None, chain_id=task.chain_id.split('_main')[0]) if task.task_type == TASK_TYPE.VALIDATION_TASK else task
ban_name = f'ban:{main_task.user_name}:{main_task.nb_name}:{main_task.chain_id}'
task.user = UserSelector.from_user_name(user_name=task.user_name)
if redis_conn.get(ban_name):
        logger.info(f'{task.id} is not started because its predecessor task was stopped')
manual_make_task_finished(task)
return
for t in range(RETRY_TIMES):
try:
insert_pods(task)
create_manager(task=task, user_name=task.user_name)
return
except ApiException as ae:
if ae.status == 409:
                logger.info('A manager for this task already exists, no need to create a new one')
return
else:
logger.exception(ae)
                logger.f_error(f'manager creation failed on attempt {t + 1}, please check manually', task=task)
continue
except DuplicatedPods as dp:
raise dp
except Exception as e:
logger.exception(e)
            logger.f_error(f'failed to start the manager, error: {e}\nforce-finishing the task, please check the database', task=task)
manual_make_task_finished(task)
raise e
    logger.f_error(f'starting the manager failed {RETRY_TIMES} times, force-finishing the task, please check the database', task=task)
manual_make_task_finished(task)
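# Main loop: watch TrainingTask archives from the parliament and launch a manager for every
# newly seen task; when the pre-stop hook fires, dispose the DB connection and kill all
# processes in the pod.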
if __name__ == '__main__':
with logger.contextualize(uuid=f'{module}.setup'):
        logger.info(f'launcher python {sys.version}')
        logger.info('starting subscription...')
add_archive_trigger(LauncherTaskTrigger)
        # keep track of tasks that have already been started
started_archive_keys = set()
thrd = threading.Thread(target=start_get_nodes_df, daemon=True)
thrd.start()
while len(nodes_dict) == 0:
time.sleep(1)
            logger.info('waiting for nodes_df to be fetched')
        logger.info('starting to receive task launch messages...')
with logger.contextualize(uuid=f'{module}.loop'):
while True:
if K8sPreStopHook.receive_stop_pod():
MarsDB().dispose()
                logger.warning('received the stop-launcher instruction, exiting')
os.system("""ps -ef | grep -v PID | awk '{system("kill -KILL " $2)}'""")
archive_keys = set(archive_dict.keys())
started_archive_keys &= archive_keys
for archive_key in filter(lambda x: TrainingTask.__name__ in x, archive_keys - started_archive_keys):
if (task := archive_dict.get(archive_key, None)) is None:
continue
try:
start_exp(task)
add_archive_for_senators(trigger_name='TrainingTaskTrigger', data=[task.id])
except DuplicatedPods as de:
                    # another launcher has already started this task, nothing more to do
                    logger.info('another launcher has already started this task, skipping')
except Exception as e:
logger.exception(e)
                    logger.f_error(f'exception while starting the manager: {str(e)}', task=task)
started_archive_keys.add(archive_key)
time.sleep(0.001)