# ray_v1alpha1_rayservice_template.yaml
  1. kind: ConfigMap
  2. apiVersion: v1
  3. metadata:
  4. name: locusttest-{cluster_id}
  5. data:
  6. locustfile.py: |
  7. {locustfile}
  8. ---
  9. kind: ConfigMap
  10. apiVersion: v1
  11. metadata:
  12. name: script-{cluster_id}
  13. data:
  14. solution.py: |
  15. {solution}
  16. ---
  17. kind: ConfigMap
  18. apiVersion: v1
  19. metadata:
  20. name: redis-config-{cluster_id}
  21. labels:
  22. app: redis
  23. data:
  24. redis.conf: |-
  25. dir /data
  26. port 6379
  27. bind 0.0.0.0
  28. appendonly yes
  29. protected-mode no
  30. requirepass 5241590000000000
  31. pidfile /data/redis-6379.pid
  32. ---
  33. apiVersion: v1
  34. kind: Service
  35. metadata:
  36. name: redis-{cluster_id}
  37. labels:
  38. app: redis
  39. spec:
  40. type: ClusterIP
  41. ports:
  42. - name: redis
  43. port: 6379
  44. selector:
  45. app: redis
  46. ---
  47. apiVersion: apps/v1
  48. kind: Deployment
  49. metadata:
  50. name: redis-{cluster_id}
  51. labels:
  52. app: redis
  53. spec:
  54. replicas: 1
  55. selector:
  56. matchLabels:
  57. app: redis
  58. template:
  59. metadata:
  60. labels:
  61. app: redis
  62. spec:
  63. containers:
  64. - name: redis
  65. image: redis:5.0.8
  66. command:
  67. - "sh"
  68. - "-c"
  69. - "redis-server /usr/local/etc/redis/redis.conf"
  70. ports:
  71. - containerPort: 6379
  72. volumeMounts:
  73. - name: config
  74. mountPath: /usr/local/etc/redis/redis.conf
  75. subPath: redis.conf
  76. volumes:
  77. - name: config
  78. configMap:
  79. name: redis-config-{cluster_id}
  80. ---
  81. apiVersion: ray.io/v1alpha1
  82. kind: RayService
  83. metadata:
  84. name: service-{cluster_id}
  85. annotations:
  86. ray.io/ft-enabled: "true"
  87. spec:
  88. serviceUnhealthySecondThreshold: 300
  89. deploymentUnhealthySecondThreshold: 300
  90. serveConfig:
  91. importPath: solution.serve_entrypoint
  92. runtimeEnv: |
  93. env_vars:
  94. PYTHONPATH: "/tmp/testing/"
  95. deployments:
  96. - name: a
  97. numReplicas: 6
  98. rayActorOptions:
  99. numCpus: 1
  100. - name: b
  101. numReplicas: 6
  102. rayActorOptions:
  103. numCpus: 1
  104. - name: c
  105. numReplicas: 6
  106. rayActorOptions:
  107. numCpus: 1
  108. - name: d
  109. numReplicas: 6
  110. rayActorOptions:
  111. numCpus: 1
  112. - name: e
  113. numReplicas: 6
  114. rayActorOptions:
  115. numCpus: 1
  116. - name: DAGDriver
  117. numReplicas: 6
  118. rayActorOptions:
  119. numCpus: 1
  120. rayClusterConfig:
  121. rayVersion: '3.0.0.dev0' # should match the Ray version in the image of the containers
  122. ######################headGroupSpecs#################################
  123. # head group template and specs, (perhaps 'group' is not needed in the name)
  124. headGroupSpec:
  125. # Kubernetes Service Type, valid values are 'ClusterIP', 'NodePort' and 'LoadBalancer'
  126. serviceType: ClusterIP
  127. # the pod replicas in this group typed head (assuming there could be more than 1 in the future)
  128. replicas: 1
  129. # logical group name, for this called head-group, also can be functional
  130. # pod type head or worker
  131. # rayNodeType: head # Not needed since it is under the headgroup
  132. # the following params are used to complete the ray start: ray start --head --block --redis-port=6379 ...
  133. rayStartParams:
  134. port: '6379' # should match container port named gcs-server
  135. object-store-memory: '100000000'
  136. dashboard-host: '0.0.0.0'
  137. num-cpus: '0' # can be auto-completed from the limits
  138. node-ip-address: $MY_POD_IP # auto-completed as the head pod IP
  139. block: 'true'
  140. #pod template
  141. template:
  142. metadata:
  143. labels:
  144. rayCluster: cluster-{cluster_id}
  145. rayNodeType: head # will be injected if missing, must be head or wroker
  146. groupName: headgroup # will be injected if missing
  147. # annotations for pod
  148. annotations:
  149. key: value
  150. spec:
  151. volumes:
  152. - name: script
  153. configMap:
  154. name: script-{cluster_id}
  155. - name: log-volume
  156. emptyDir: {{}}
  157. containers:
  158. - name: ray-head
  159. image: {ray_image}
  160. imagePullPolicy: Always
  161. env:
  162. - name: MY_POD_IP
  163. valueFrom:
  164. fieldRef:
  165. fieldPath: status.podIP
  166. - name: RAY_REDIS_ADDRESS
  167. value: redis-{cluster_id}:6379
  168. - name: RAY_gcs_rpc_server_reconnect_timeout_s
  169. value: "600"
  170. - name: SERVE_DEPLOYMENT_HANDLE_IS_SYNC
  171. value: "1"
  172. resources:
  173. limits:
  174. cpu: 2
  175. requests:
  176. cpu: 2
  177. ports:
  178. - containerPort: 6379
  179. name: gcs-server
  180. - containerPort: 8265 # Ray dashboard
  181. name: dashboard
  182. - containerPort: 10001
  183. name: client
  184. - containerPort: 8000
  185. name: serve
  186. volumeMounts:
  187. - name: script
  188. mountPath: /tmp/testing/solution.py
  189. subPath: solution.py
  190. - mountPath: /tmp/ray/
  191. name: log-volume
  192. workerGroupSpecs:
  193. # the pod replicas in this group typed worker
  194. - replicas: 12
  195. minReplicas: 12
  196. maxReplicas: 12
  197. # logical group name, for this called small-group, also can be functional
  198. groupName: small-group
  199. # if worker pods need to be added, we can simply increment the replicas
  200. # if worker pods need to be removed, we decrement the replicas, and populate the podsToDelete list
  201. # the operator will remove pods from the list until the number of replicas is satisfied
  202. # when a pod is confirmed to be deleted, its name will be removed from the list below
  203. #scaleStrategy:
  204. # workersToDelete:
  205. # - raycluster-complete-worker-small-group-bdtwh
  206. # - raycluster-complete-worker-small-group-hv457
  207. # - raycluster-complete-worker-small-group-k8tj7
  208. # the following params are used to complete the ray start: ray start --block --node-ip-address= ...
  209. rayStartParams:
  210. node-ip-address: $MY_POD_IP
  211. block: 'true'
  212. num-cpus: '4' # can be auto-completed from the limits
  213. #pod template
  214. template:
  215. metadata:
  216. labels:
  217. key: value
  218. rayCluster: cluster-{cluster_id}
  219. # annotations for pod
  220. annotations:
  221. key: value
  222. spec:
  223. initContainers:
  224. # the env var $RAY_IP is set by the operator if missing, with the value of the head service name
  225. - name: init-myservice
  226. image: busybox:1.28
  227. command: ['sh', '-c', "until nslookup $RAY_IP.$(cat /var/run/secrets/kubernetes.io/serviceaccount/namespace).svc.cluster.local; do echo waiting for myservice; sleep 2; done"]
  228. volumes:
  229. - name: script
  230. configMap:
  231. name: script-{cluster_id}
  232. - name: log-volume
  233. emptyDir: {{}}
  234. containers:
  235. - name: machine-learning # must consist of lower case alphanumeric characters or '-', and must start and end with an alphanumeric character (e.g. 'my-name', or '123-abc'
  236. image: {ray_image}
  237. imagePullPolicy: Always
  238. livenessProbe:
  239. initialDelaySeconds: 30
  240. periodSeconds: 5
  241. timeoutSeconds: 10
  242. readinessProbe:
  243. initialDelaySeconds: 30
  244. periodSeconds: 5
  245. timeoutSeconds: 10
  246. # environment variables to set in the container.Optional.
  247. # Refer to https://kubernetes.io/docs/tasks/inject-data-application/define-environment-variable-container/
  248. env:
  249. - name: RAY_DISABLE_DOCKER_CPU_WARNING
  250. value: "1"
  251. - name: TYPE
  252. value: "worker"
  253. - name: CPU_REQUEST
  254. valueFrom:
  255. resourceFieldRef:
  256. containerName: machine-learning
  257. resource: requests.cpu
  258. - name: CPU_LIMITS
  259. valueFrom:
  260. resourceFieldRef:
  261. containerName: machine-learning
  262. resource: limits.cpu
  263. - name: MEMORY_LIMITS
  264. valueFrom:
  265. resourceFieldRef:
  266. containerName: machine-learning
  267. resource: limits.memory
  268. - name: MEMORY_REQUESTS
  269. valueFrom:
  270. resourceFieldRef:
  271. containerName: machine-learning
  272. resource: requests.memory
  273. - name: MY_POD_NAME
  274. valueFrom:
  275. fieldRef:
  276. fieldPath: metadata.name
  277. - name: MY_POD_IP
  278. valueFrom:
  279. fieldRef:
  280. fieldPath: status.podIP
  281. - name: RAY_gcs_rpc_server_reconnect_timeout_s
  282. value: "600"
  283. - name: RAY_gcs_server_request_timeout_seconds
  284. value: "5"
  285. - name: SERVE_DEPLOYMENT_HANDLE_IS_SYNC
  286. value: "1"
  287. ports:
  288. - containerPort: 80
  289. name: client
  290. lifecycle:
  291. preStop:
  292. exec:
  293. command: ["/bin/sh","-c","ray stop"]
  294. resources:
  295. limits:
  296. cpu: "2"
  297. requests:
  298. cpu: "2"
  299. volumeMounts:
  300. - name: script
  301. mountPath: /tmp/testing/solution.py
  302. subPath: solution.py
  303. - mountPath: /tmp/ray/
  304. name: log-volume