azure-ray-template.json 22 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520
  1. {
  2. "$schema": "https://schema.management.azure.com/schemas/2019-04-01/deploymentTemplate.json#",
  3. "contentVersion": "1.0.0.0",
  4. "parameters": {
  5. "adminUsername": {
  6. "type": "string",
  7. "defaultValue": "ubuntu",
  8. "metadata": {
  9. "description": "Username for the Virtual Machine."
  10. }
  11. },
  12. "publicKey": {
  13. "type": "securestring",
  14. "metadata": {
  15. "description": "SSH Key for the Virtual Machine"
  16. }
  17. },
  18. "adminPassword": {
  19. "type": "securestring",
  20. "metadata": {
  21. "description": "Password for the Virtual Machine and JupyterLab"
  22. }
  23. },
  24. "headNodeSize": {
  25. "type": "string",
  26. "defaultValue": "Standard_D2s_v3",
  27. "metadata": {
  28. "description": "The size of the head-node Virtual Machine"
  29. }
  30. },
  31. "headNodePriority": {
  32. "type": "string",
  33. "defaultValue": "Regular",
  34. "allowedValues": ["Regular", "Low", "Spot"],
  35. "metadata": {
  36. "description": "Use Azure Spot instance for the head node"
  37. }
  38. },
  39. "workerNodeSize": {
  40. "type": "string",
  41. "defaultValue": "Standard_D2s_v3",
  42. "metadata": {
  43. "description": "The size of the worker node Virtual Machine"
  44. }
  45. },
  46. "workerNodePriority": {
  47. "type": "string",
  48. "defaultValue": "Spot",
  49. "allowedValues": ["Regular", "Low", "Spot"],
  50. "metadata": {
  51. "description": "Use Azure Spot instance for worker nodes"
  52. }
  53. },
  54. "workerInitial": {
  55. "type": "int",
  56. "defaultValue": 1,
  57. "minValue": 0,
  58. "maxValue": 1000,
  59. "metadata": {
  60. "description": "Initial number of worker nodes"
  61. }
  62. },
  63. "workerMin": {
  64. "type": "int",
  65. "defaultValue": 1,
  66. "minValue": 0,
  67. "maxValue": 1000,
  68. "metadata": {
  69. "description": "Minimum number of worker nodes"
  70. }
  71. },
  72. "workerMax": {
  73. "type": "int",
  74. "defaultValue": 1,
  75. "minValue": 0,
  76. "maxValue": 1000,
  77. "metadata": {
  78. "description": "Maximum number of worker nodes"
  79. }
  80. },
  81. "condaEnv": {
  82. "type": "string",
  83. "defaultValue": "py38_tensorflow",
  84. "allowedValues": [
  85. "azureml_py36_automl",
  86. "azureml_py36_pytorch",
  87. "azureml_py36_tensorflow",
  88. "py38_default",
  89. "py38_pytorch",
  90. "py38_tensorflow"
  91. ],
  92. "metadata": {
  93. "description": "Conda environment to select (installed on DSVM)"
  94. }
  95. },
  96. "PythonPackages": {
  97. "type": "string",
  98. "defaultValue": "ray[rllib] gym[atari]",
  99. "metadata": {
  100. "description": "Python packages to install (space separated)"
  101. }
  102. },
  103. "PublicWebUI": {
  104. "type": "bool",
  105. "defaultValue": true,
  106. "metadata": {
  107. "description": "Open port for web UI"
  108. }
  109. }
  110. },
  111. "variables": {
  112. "azureScriptInitUrl": "https://raw.githubusercontent.com/ray-project/ray/master/doc/azure/azure-init.sh",
  113. "location": "[resourceGroup().location]",
  114. "vmName": "ray-node",
  115. "subnetWorkers": "10.32.0.0/16",
  116. "subnetHead": "10.33.0.0/16",
  117. "publicIpAddressName": "[concat(variables('vmName'), '-ip' )]",
  118. "networkIpConfig": "[guid(resourceGroup().id, variables('vmName'))]",
  119. "subnetName": "ray-subnet",
  120. "subnetHeadName": "ray-subnet-head",
  121. "subnetRef": "[resourceId('Microsoft.Network/virtualNetworks/subnets', variables('vNetName'), variables('subnetName'))]",
  122. "subnetHeadRef": "[resourceId('Microsoft.Network/virtualNetworks/subnets', variables('vNetName'), variables('subnetHeadName'))]",
  123. "osDiskType": "Standard_LRS",
  124. "vmNameHead": "[concat(variables('vmName'), '-head')]",
  125. "vmNameWorker": "[concat(variables('vmName'), '-workers')]",
  126. "networkInterfaceName": "[concat(variables('vmName'), '-nic')]",
  127. "networkSecurityGroupName": "ray-nsg",
  128. "vNetName": "ray-vnet",
  129. "subnetNetwork": "[split(variables('subnetHead'), '/')[0]]",
  130. "headInternalIP": "[concat(substring(variables('subnetNetwork'), 0, lastIndexOf(variables('subnetNetwork'), '.')), '.5')]",
  131. "imagePublisher": "microsoft-dsvm",
  132. "imageOffer": "ubuntu-1804",
  133. "imageSku": "1804",
  134. "imageVersion": "latest"
  135. },
  136. "resources": [
  137. {
  138. "type": "Microsoft.Network/networkSecurityGroups",
  139. "apiVersion": "2019-02-01",
  140. "name": "[variables('networkSecurityGroupName')]",
  141. "location": "[variables('location')]",
  142. "properties": {
  143. "securityRules": [
  144. {
  145. "name": "SSH",
  146. "properties": {
  147. "priority": 1000,
  148. "protocol": "TCP",
  149. "access": "Allow",
  150. "direction": "Inbound",
  151. "sourceAddressPrefix": "*",
  152. "sourcePortRange": "*",
  153. "destinationAddressPrefix": "*",
  154. "destinationPortRange": "22"
  155. }
  156. },
  157. {
  158. "name": "JupyterLab",
  159. "properties": {
  160. "priority": 1001,
  161. "protocol": "TCP",
  162. "access": "Allow",
  163. "direction": "Inbound",
  164. "sourceAddressPrefix": "*",
  165. "sourcePortRange": "*",
  166. "destinationAddressPrefix": "*",
  167. "destinationPortRange": "8000"
  168. }
  169. },
  170. {
  171. "name": "RayWebUI",
  172. "properties": {
  173. "priority": 1002,
  174. "protocol": "TCP",
  175. "access": "[if(parameters('PublicWebUI'), 'Allow', 'Deny')]",
  176. "direction": "Inbound",
  177. "sourceAddressPrefix": "*",
  178. "sourcePortRange": "*",
  179. "destinationAddressPrefix": "*",
  180. "destinationPortRange": "8265"
  181. }
  182. },
  183. {
  184. "name": "TensorBoard",
  185. "properties": {
  186. "priority": 1003,
  187. "protocol": "TCP",
  188. "access": "[if(parameters('PublicWebUI'), 'Allow', 'Deny')]",
  189. "direction": "Inbound",
  190. "sourceAddressPrefix": "*",
  191. "sourcePortRange": "*",
  192. "destinationAddressPrefix": "*",
  193. "destinationPortRange": "6006"
  194. }
  195. }
  196. ]
  197. }
  198. },
  199. {
  200. "type": "Microsoft.Network/virtualNetworks",
  201. "apiVersion": "2019-11-01",
  202. "name": "[variables('vNetName')]",
  203. "location": "[variables('location')]",
  204. "properties": {
  205. "addressSpace": {
  206. "addressPrefixes": [
  207. "[variables('subnetHead')]",
  208. "[variables('subnetWorkers')]"
  209. ]
  210. },
  211. "subnets": [
  212. {
  213. "name": "[variables('subnetName')]",
  214. "properties": {
  215. "addressPrefix": "[variables('subnetWorkers')]"
  216. }
  217. },
  218. {
  219. "name": "[variables('subnetHeadName')]",
  220. "properties": {
  221. "addressPrefix": "[variables('subnetHead')]"
  222. }
  223. }
  224. ]
  225. }
  226. },
  227. {
  228. "type": "Microsoft.Network/publicIpAddresses",
  229. "apiVersion": "2019-02-01",
  230. "name": "[variables('publicIpAddressName')]",
  231. "location": "[variables('location')]",
  232. "properties": {
  233. "publicIpAllocationMethod": "Static",
  234. "publicIPAddressVersion": "IPv4"
  235. },
  236. "sku": {
  237. "name": "Basic",
  238. "tier": "Regional"
  239. }
  240. },
  241. {
  242. "type": "Microsoft.Network/networkInterfaces",
  243. "apiVersion": "2020-06-01",
  244. "name": "[variables('networkInterfaceName')]",
  245. "location": "[variables('location')]",
  246. "dependsOn": [
  247. "[resourceId('Microsoft.Network/publicIpAddresses', variables('publicIpAddressName'))]",
  248. "[resourceId('Microsoft.Network/networkSecurityGroups', variables('networkSecurityGroupName'))]", "[resourceId('Microsoft.Network/virtualNetworks', variables('vNetName'))]"
  249. ],
  250. "properties": {
  251. "ipConfigurations": [
  252. {
  253. "name": "[variables('networkIpConfig')]",
  254. "properties": {
  255. "subnet": {
  256. "id": "[variables('subnetHeadRef')]"
  257. },
  258. "privateIPAllocationMethod": "Static",
  259. "privateIPAddress": "[variables('headInternalIP')]",
  260. "publicIpAddress": {
  261. "id": "[resourceId('Microsoft.Network/publicIPAddresses', variables('publicIPAddressName'))]"
  262. }
  263. }
  264. }
  265. ],
  266. "networkSecurityGroup": {
  267. "id": "[resourceId('Microsoft.Network/networkSecurityGroups', variables('networkSecurityGroupName'))]"
  268. }
  269. }
  270. },
  271. {
  272. "type": "Microsoft.Compute/virtualMachines",
  273. "apiVersion": "2020-06-01",
  274. "name": "[variables('vmNameHead')]",
  275. "location": "[variables('location')]",
  276. "dependsOn": [
  277. "[resourceId('Microsoft.Network/networkInterfaces', variables('networkInterfaceName'))]"
  278. ],
  279. "properties": {
  280. "hardwareProfile": {
  281. "vmSize": "[parameters('headNodeSize')]"
  282. },
  283. "priority": "[parameters('headNodePriority')]",
  284. "storageProfile": {
  285. "osDisk": {
  286. "createOption": "fromImage",
  287. "managedDisk": {
  288. "storageAccountType": "[variables('osDiskType')]"
  289. }
  290. },
  291. "imageReference": {
  292. "publisher": "[variables('imagePublisher')]",
  293. "offer": "[variables('imageOffer')]",
  294. "sku": "[variables('imageSku')]",
  295. "version": "[variables('imageVersion')]"
  296. }
  297. },
  298. "networkProfile": {
  299. "networkInterfaces": [
  300. {
  301. "id": "[resourceId('Microsoft.Network/networkInterfaces', variables('networkInterfaceName'))]"
  302. }
  303. ]
  304. },
  305. "osProfile": {
  306. "computerName": "[variables('vmNameHead')]",
  307. "adminUsername": "[parameters('adminUsername')]",
  308. "adminPassword": "[parameters('adminPassword')]",
  309. "linuxConfiguration": {
  310. "disablePasswordAuthentication": false,
  311. "ssh": {
  312. "publicKeys": [
  313. {
  314. "path": "[concat('/home/', parameters('adminUsername'), '/.ssh/authorized_keys')]",
  315. "keyData": "[parameters('publicKey')]"
  316. }
  317. ]
  318. }
  319. }
  320. }
  321. },
  322. "resources": [
  323. {
  324. "type": "Microsoft.Compute/virtualMachines/extensions",
  325. "name": "[concat(variables('vmNameHead'), '/HeadNodeInitScript')]",
  326. "apiVersion": "2020-06-01",
  327. "location": "[variables('location')]",
  328. "dependsOn": [
  329. "[resourceId('Microsoft.Compute/virtualMachines', variables('vmNameHead'))]"
  330. ],
  331. "properties": {
  332. "publisher": "Microsoft.Azure.Extensions",
  333. "type": "CustomScript",
  334. "typeHandlerVersion": "2.1",
  335. "autoUpgradeMinorVersion": true,
  336. "settings": {
  337. "commandToExecute": "[concat('sh azure-init.sh ', parameters('adminUsername'), ' ', parameters('condaEnv'), ' \"', parameters('PythonPackages'), '\" ignore head >/var/log/ray-head.log 2>&1')]",
  338. "fileUris": [
  339. "[variables('azureScriptInitUrl')]"
  340. ]
  341. }
  342. }
  343. }
  344. ]
  345. },
  346. {
  347. "type": "Microsoft.Compute/virtualMachineScaleSets",
  348. "name": "[variables('vmNameWorker')]",
  349. "location": "[variables('location')]",
  350. "apiVersion": "2019-07-01",
  351. "dependsOn": [
  352. "[resourceId('Microsoft.Network/virtualNetworks', variables('vNetName'))]"
  353. ],
  354. "sku": {
  355. "name": "[parameters('workerNodeSize')]",
  356. "tier": "Standard",
  357. "capacity": "[parameters('workerInitial')]"
  358. },
  359. "properties": {
  360. "upgradePolicy": {
  361. "mode": "Manual"
  362. },
  363. "virtualMachineProfile": {
  364. "priority": "[parameters('workerNodePriority')]",
  365. "storageProfile": {
  366. "osDisk": {
  367. "createOption": "fromImage",
  368. "managedDisk": {
  369. "storageAccountType": "[variables('osDiskType')]"
  370. }
  371. },
  372. "imageReference": {
  373. "publisher": "[variables('imagePublisher')]",
  374. "offer": "[variables('imageOffer')]",
  375. "sku": "[variables('imageSku')]",
  376. "version": "[variables('imageVersion')]"
  377. }
  378. },
  379. "osProfile": {
  380. "computerNamePrefix": "[variables('vmNameWorker')]",
  381. "adminUsername": "[parameters('adminUsername')]",
  382. "adminPassword": "[parameters('adminPassword')]",
  383. "linuxConfiguration": {
  384. "disablePasswordAuthentication": false,
  385. "ssh": {
  386. "publicKeys": [
  387. {
  388. "path": "[concat('/home/', parameters('adminUsername'), '/.ssh/authorized_keys')]",
  389. "keyData": "[parameters('publicKey')]"
  390. }
  391. ]
  392. }
  393. }
  394. },
  395. "networkProfile": {
  396. "networkInterfaceConfigurations": [
  397. {
  398. "name": "[concat(variables('vmNameWorker'),'-nic')]",
  399. "properties": {
  400. "primary": true,
  401. "ipConfigurations": [
  402. {
  403. "name": "worker-ip-config",
  404. "properties": {
  405. "subnet": {
  406. "id": "[variables('subnetRef')]"
  407. }
  408. }
  409. }
  410. ]
  411. }
  412. }
  413. ]
  414. },
  415. "extensionProfile": {
  416. "extensions": [
  417. {
  418. "name": "RayWorkerInitScript",
  419. "properties": {
  420. "publisher": "Microsoft.Azure.Extensions",
  421. "type": "CustomScript",
  422. "typeHandlerVersion": "2.1",
  423. "autoUpgradeMinorVersion": true,
  424. "settings": {
  425. "commandToExecute": "[concat('sh azure-init.sh ', parameters('adminUsername'), ' ', parameters('condaEnv'), ' \"', parameters('PythonPackages'), '\" ', variables('headInternalIP'), ' worker >/var/log/ray-worker.log 2>&1')]",
  426. "fileUris": [
  427. "[variables('azureScriptInitUrl')]"
  428. ]
  429. }
  430. }
  431. }
  432. ]
  433. }
  434. }
  435. }
  436. },
  437. {
  438. "type": "Microsoft.Insights/autoscaleSettings",
  439. "apiVersion": "2015-04-01",
  440. "name": "cpuautoscale",
  441. "location": "[variables('location')]",
  442. "dependsOn": [
  443. "[resourceId('Microsoft.Compute/virtualMachineScaleSets', variables('vmNameWorker'))]"
  444. ],
  445. "properties": {
  446. "name": "cpuautoscale",
  447. "targetResourceUri": "[concat(resourceGroup().id, '/providers/Microsoft.Compute/virtualMachineScaleSets/', variables('vmNameWorker'))]",
  448. "enabled": true,
  449. "profiles": [
  450. {
  451. "name": "Profile1",
  452. "capacity": {
  453. "minimum": "[parameters('workerMin')]",
  454. "maximum": "[parameters('workerMax')]",
  455. "default": "[parameters('workerInitial')]"
  456. },
  457. "rules": [
  458. {
  459. "metricTrigger": {
  460. "metricName": "Percentage CPU",
  461. "metricResourceUri": "[concat(resourceGroup().id, '/providers/Microsoft.Compute/virtualMachineScaleSets/', variables('vmNameWorker'))]",
  462. "timeGrain": "PT1M",
  463. "statistic": "Average",
  464. "timeWindow": "PT10M",
  465. "timeAggregation": "Average",
  466. "operator": "GreaterThan",
  467. "threshold": 80
  468. },
  469. "scaleAction": {
  470. "direction": "Increase",
  471. "type": "ChangeCount",
  472. "value": "1",
  473. "cooldown": "PT5M"
  474. }
  475. },
  476. {
  477. "metricTrigger": {
  478. "metricName": "Percentage CPU",
  479. "metricResourceUri": "[concat(resourceGroup().id, '/providers/Microsoft.Compute/virtualMachineScaleSets/', variables('vmNameWorker'))]",
  480. "timeGrain": "PT1M",
  481. "statistic": "Average",
  482. "timeWindow": "PT30M",
  483. "timeAggregation": "Average",
  484. "operator": "LessThan",
  485. "threshold": 20
  486. },
  487. "scaleAction": {
  488. "direction": "Decrease",
  489. "type": "ChangeCount",
  490. "value": "1",
  491. "cooldown": "PT5M"
  492. }
  493. }
  494. ]
  495. }
  496. ]
  497. }
  498. }
  499. ],
  500. "outputs": {
  501. "JupyterLabURL": {
  502. "type": "string",
  503. "value": "[concat('https://', reference(variables('publicIpAddressName')).ipAddress, ':8000')]"
  504. },
  505. "SSH": {
  506. "type": "string",
  507. "value": "[concat('ssh -t -L 8265:localhost:8265 -L 8888:localhost:8888 ', parameters('adminUsername'),'@', reference(variables('publicIpAddressName')).ipAddress)]"
  508. },
  509. "RayWebUIURL": {
  510. "type": "string",
  511. "value": "[concat('http://', reference(variables('publicIpAddressName')).ipAddress, ':8265')]",
  512. "condition": "[parameters('PublicWebUI')]"
  513. },
  514. "TensorBoard": {
  515. "type": "string",
  516. "value": "[concat('http://', reference(variables('publicIpAddressName')).ipAddress, ':6006')]",
  517. "condition": "[parameters('PublicWebUI')]"
  518. }
  519. }
  520. }