Coverage for gco/stacks/analytics_stack.py: 99%

157 statements  

« prev     ^ index     » next       coverage.py v7.14.1, created at 2026-06-15 15:07 +0000

1"""Analytics stack for GCO - optional ML/analytics environment. 

2 

3Instantiated only when ``analytics_environment.enabled=true`` in ``cdk.json``. 

4When the toggle is ``false`` (the default), ``app.py`` skips creating it so 

5``cdk synth`` emits no SageMaker, EMR Serverless, or Cognito resources. 

6 

7Resources (wired in this order): 

8 

91. ``_create_kms_key`` — ``Analytics_KMS_Key`` 

102. ``_create_vpc_and_endpoints`` — private VPC + endpoints 

113. ``_create_access_logs_bucket`` — S3 access-logs bucket 

124. ``_create_studio_only_bucket`` — ``Studio_Only_Bucket`` 

135. ``_create_studio_efs`` — ``Studio_EFS`` 

146. ``_create_execution_role_and_grants`` — ``SageMaker_Execution_Role`` 

157. ``_grant_sagemaker_role_on_cluster_shared_bucket`` — cross-region IAM grant 

168. ``_create_studio_domain`` — ``sagemaker.CfnDomain`` 

179. ``_create_emr_app`` — ``emrserverless.CfnApplication`` 

1810. ``_create_cognito_pool`` — Cognito pool + client + domain 

1911. ``_create_presigned_url_lambda`` — ``Presigned_URL_Lambda`` 

2012. ``_apply_nag_suppressions`` — analytics-branch nag dispatch 

21 

22The API Gateway ``/studio/*`` wiring that consumes this Lambda lives in 

23``gco/stacks/api_gateway_global_stack.py``. 

24""" 

25 

26from __future__ import annotations 

27 

28from typing import Any 

29 

30from aws_cdk import ( 

31 CfnOutput, 

32 Duration, 

33 RemovalPolicy, 

34 Stack, 

35) 

36from aws_cdk import aws_cognito as cognito 

37from aws_cdk import aws_ec2 as ec2 

38from aws_cdk import aws_efs as efs 

39from aws_cdk import aws_emrserverless as emrserverless 

40from aws_cdk import aws_iam as iam 

41from aws_cdk import aws_kms as kms 

42from aws_cdk import aws_lambda as lambda_ 

43from aws_cdk import aws_logs as logs 

44from aws_cdk import aws_s3 as s3 

45from aws_cdk import aws_sagemaker as sagemaker 

46from aws_cdk import custom_resources as cr 

47from constructs import Construct 

48 

49from gco.config.config_loader import ConfigLoader 

50from gco.stacks.constants import ( 

51 CLUSTER_SHARED_SSM_PARAMETER_PREFIX, 

52 COGNITO_DOMAIN_PREFIX_DEFAULT, 

53 EMR_SERVERLESS_RELEASE_LABEL, 

54 LAMBDA_PYTHON_RUNTIME, 

55 SAGEMAKER_ROLE_NAME_PREFIX, 

56) 

57from gco.stacks.nag_suppressions import apply_all_suppressions 

58 

59# <pyflowchart-code-diagram> BEGIN - auto-inserted, do not edit 

60# Flowchart(s) generated from this file: 

61# * ``GCOAnalyticsStack.__init__`` -> ``diagrams/code_diagrams/gco/stacks/analytics_stack.GCOAnalyticsStack___init__.html`` 

62# (PNG: ``diagrams/code_diagrams/gco/stacks/analytics_stack.GCOAnalyticsStack___init__.png``) 

63# * ``GCOAnalyticsStack._create_execution_role_and_grants`` -> ``diagrams/code_diagrams/gco/stacks/analytics_stack.GCOAnalyticsStack__create_execution_role_and_grants.html`` 

64# (PNG: ``diagrams/code_diagrams/gco/stacks/analytics_stack.GCOAnalyticsStack__create_execution_role_and_grants.png``) 

65# * ``GCOAnalyticsStack._create_studio_domain`` -> ``diagrams/code_diagrams/gco/stacks/analytics_stack.GCOAnalyticsStack__create_studio_domain.html`` 

66# (PNG: ``diagrams/code_diagrams/gco/stacks/analytics_stack.GCOAnalyticsStack__create_studio_domain.png``) 

67# Regenerate with ``python diagrams/code_diagrams/generate.py``. 

68# <pyflowchart-code-diagram> END 

69 

70 

def _parse_removal(value: str) -> RemovalPolicy:
    """Map a cdk.json removal-policy string to ``aws_cdk.RemovalPolicy``.

    Translates ``analytics_environment.{efs,cognito}.removal_policy`` into
    the matching enum member.

    Args:
        value: ``"retain"`` or ``"destroy"``; surrounding whitespace and
            letter case are ignored.

    Returns:
        ``RemovalPolicy.RETAIN`` or ``RemovalPolicy.DESTROY``.

    Raises:
        ValueError: If ``value`` is not a string, or does not normalize to
            one of the two accepted spellings.
    """
    # The value comes straight out of the configuration mapping, so it may
    # not be a string at all (null, number, bool). Reject that with the same
    # ValueError as any other bad value instead of letting ``.strip()`` fail
    # with an unrelated AttributeError.
    normalized = value.strip().lower() if isinstance(value, str) else None
    if normalized not in ("retain", "destroy"):
        raise ValueError(
            f"analytics_environment removal_policy must be 'retain' or 'destroy', got {value!r}"
        )
    return RemovalPolicy.RETAIN if normalized == "retain" else RemovalPolicy.DESTROY

86 

87 

88class GCOAnalyticsStack(Stack): 

89 """Optional ML/analytics environment: SageMaker Studio, EMR Serverless, Cognito. 

90 

91 Only instantiated when ``analytics_environment.enabled=true``. Lives in 

92 the API gateway region so the presigned-URL Lambda can wire into the 

93 existing ``/studio/*`` routes on ``GCOApiGatewayGlobalStack`` without 

94 a cross-region hop. 

95 """ 

96 

def __init__(
    self,
    scope: Construct,
    construct_id: str,
    *,
    config: ConfigLoader,
    api_gateway_secret_arn: str | None = None,
    **kwargs: Any,
) -> None:
    """Read the analytics configuration and build every resource in order.

    Args:
        scope: Parent construct, forwarded to ``Stack.__init__``.
        construct_id: Logical id of this stack, forwarded to ``Stack.__init__``.
        config: Loader that supplies ``get_analytics_config()`` and is kept
            on ``self.config`` for the helper methods.
        api_gateway_secret_arn: Stored on the instance only; reserved for
            future auth wiring so the constructor signature stays stable.
        **kwargs: Passed through to ``Stack.__init__`` unchanged.

    Raises:
        ValueError: If either configured ``removal_policy`` is rejected by
            ``_parse_removal``.
    """
    super().__init__(scope, construct_id, **kwargs)

    self.config = config
    self.api_gateway_secret_arn = api_gateway_secret_arn

    # Sub-toggles and removal policies come from the analytics section of
    # the configuration; lookups stay in hyperpod/canvas/efs/cognito order.
    analytics_cfg = config.get_analytics_config()
    self.hyperpod_enabled: bool = bool(analytics_cfg["hyperpod"]["enabled"])
    self.canvas_enabled: bool = bool(analytics_cfg["canvas"]["enabled"])
    self.efs_removal: RemovalPolicy = _parse_removal(analytics_cfg["efs"]["removal_policy"])
    cognito_cfg = analytics_cfg["cognito"]
    self.cognito_removal: RemovalPolicy = _parse_removal(cognito_cfg["removal_policy"])
    self._cognito_domain_prefix_override: str | None = cognito_cfg.get("domain_prefix")

    # The sequence is load-bearing: each builder consumes attributes set by
    # the ones before it (EFS ARN -> execution role -> studio domain, etc.).
    build_steps = (
        self._create_kms_key,
        self._create_vpc_and_endpoints,
        self._create_access_logs_bucket,
        self._create_studio_only_bucket,
        self._create_studio_efs,
        self._create_execution_role_and_grants,
        self._grant_sagemaker_role_on_cluster_shared_bucket,
        self._create_studio_domain,
        self._create_emr_app,
        self._create_cognito_pool,
        self._create_presigned_url_lambda,
        self._apply_nag_suppressions,
    )
    for build_step in build_steps:
        build_step()

134 

135 # ================================================================== 

136 # KMS + VPC 

137 # ================================================================== 

138 

def _create_kms_key(self) -> None:
    """Create ``Analytics_KMS_Key`` and open its key policy to four services.

    The key is customer-managed with rotation enabled, a 7-day pending
    deletion window and ``RemovalPolicy.DESTROY``. It is stored on
    ``self.kms_key`` for the bucket, EFS and role helpers that run later.

    One ALLOW statement per service principal (regional CloudWatch Logs,
    SageMaker, S3, EFS) is appended to the key policy. The SID is derived
    from the first DNS label of the principal, e.g. ``AllowLogsEncrypt``.
    """
    self.kms_key = kms.Key(
        self,
        "AnalyticsKmsKey",
        description="Analytics_KMS_Key - encrypts analytics S3 buckets, Studio EFS, SageMaker artifacts",
        enable_key_rotation=True,
        pending_window=Duration.days(7),
        removal_policy=RemovalPolicy.DESTROY,
    )

    # Cryptographic operations granted to every service principal below.
    crypto_actions = (
        "kms:Encrypt",
        "kms:Decrypt",
        "kms:ReEncrypt*",
        "kms:GenerateDataKey*",
        "kms:DescribeKey",
    )
    service_hosts = (
        f"logs.{self.region}.amazonaws.com",
        "sagemaker.amazonaws.com",
        "s3.amazonaws.com",
        "elasticfilesystem.amazonaws.com",
    )
    for host in service_hosts:
        # "logs.<region>.amazonaws.com" -> "Logs", "s3.amazonaws.com" -> "S3"
        service_label = host.partition(".")[0].capitalize()
        statement = iam.PolicyStatement(
            sid=f"Allow{service_label}Encrypt",
            effect=iam.Effect.ALLOW,
            principals=[iam.ServicePrincipal(host)],
            actions=list(crypto_actions),
            # In a key policy "*" always refers to the key itself.
            resources=["*"],
        )
        self.kms_key.add_to_resource_policy(statement)

181 

def _create_vpc_and_endpoints(self) -> None:
    """Create the analytics VPC, an S3 gateway endpoint and nine interface endpoints.

    Layout: at most two AZs, one NAT gateway, a /24 ``PRIVATE_WITH_EGRESS``
    tier and a /28 ``PUBLIC`` tier. Every interface endpoint is pinned to
    the private-with-egress subnets. The NAT gateway gives notebooks
    internet egress (pip install, git clone, external APIs such as
    HuggingFace or PyPI).

    Note that the VPC does include a small public tier; presumably it
    exists to host the NAT gateway, and nothing in this method places
    workloads there.

    The VPC is stored on ``self.vpc``.
    """
    private_tier = ec2.SubnetConfiguration(
        name="AnalyticsPrivate",
        subnet_type=ec2.SubnetType.PRIVATE_WITH_EGRESS,
        cidr_mask=24,
    )
    public_tier = ec2.SubnetConfiguration(
        name="AnalyticsPublic",
        subnet_type=ec2.SubnetType.PUBLIC,
        cidr_mask=28,
    )
    self.vpc = ec2.Vpc(
        self,
        "AnalyticsVpc",
        max_azs=2,
        nat_gateways=1,
        subnet_configuration=[private_tier, public_tier],
    )

    # S3 is reached through a gateway endpoint rather than an interface one.
    self.vpc.add_gateway_endpoint(
        "S3GatewayEndpoint",
        service=ec2.GatewayVpcEndpointAwsService.S3,
    )

    # (construct id, AWS service) for each interface endpoint Studio uses.
    endpoint_specs: tuple[tuple[str, ec2.InterfaceVpcEndpointAwsService], ...] = (
        ("SagemakerApiEndpoint", ec2.InterfaceVpcEndpointAwsService.SAGEMAKER_API),
        ("SagemakerRuntimeEndpoint", ec2.InterfaceVpcEndpointAwsService.SAGEMAKER_RUNTIME),
        ("SagemakerStudioEndpoint", ec2.InterfaceVpcEndpointAwsService.SAGEMAKER_STUDIO),
        ("SagemakerNotebookEndpoint", ec2.InterfaceVpcEndpointAwsService.SAGEMAKER_NOTEBOOK),
        ("StsEndpoint", ec2.InterfaceVpcEndpointAwsService.STS),
        ("CloudWatchLogsEndpoint", ec2.InterfaceVpcEndpointAwsService.CLOUDWATCH_LOGS),
        ("EcrEndpoint", ec2.InterfaceVpcEndpointAwsService.ECR),
        ("EcrDockerEndpoint", ec2.InterfaceVpcEndpointAwsService.ECR_DOCKER),
        ("EfsEndpoint", ec2.InterfaceVpcEndpointAwsService.ELASTIC_FILESYSTEM),
    )
    for endpoint_id, aws_service in endpoint_specs:
        private_only = ec2.SubnetSelection(subnet_type=ec2.SubnetType.PRIVATE_WITH_EGRESS)
        self.vpc.add_interface_endpoint(
            endpoint_id,
            service=aws_service,
            subnets=private_only,
        )

236 

237 # ================================================================== 

238 # S3 buckets 

239 # ================================================================== 

240 

def _create_access_logs_bucket(self) -> None:
    """Create the S3 bucket that receives server-access logs for ``Studio_Only_Bucket``.

    The sink uses S3-managed encryption (SSE-S3) rather than the analytics
    KMS key: server-access-log delivery to a KMS-encrypted destination
    needs extra log-delivery plumbing, so the usual split is SSE-S3 for the
    sink and KMS for the bucket being logged. Public access is blocked, TLS
    is enforced, versioning is on, and objects expire after 90 days. The
    bucket and its contents are removed with the stack.

    The sink has no access-log target of its own; the resulting
    ``AwsSolutions-S1`` finding is expected to be suppressed by the
    analytics nag branch (handled outside this method).

    The bucket is stored on ``self.access_logs_bucket``.
    """
    # Access logs are only kept for 90 days.
    expire_after_90_days = s3.LifecycleRule(
        id="ExpireAccessLogs",
        enabled=True,
        expiration=Duration.days(90),
    )
    self.access_logs_bucket = s3.Bucket(
        self,
        "AnalyticsAccessLogsBucket",
        encryption=s3.BucketEncryption.S3_MANAGED,
        block_public_access=s3.BlockPublicAccess.BLOCK_ALL,
        enforce_ssl=True,
        versioned=True,
        removal_policy=RemovalPolicy.DESTROY,
        auto_delete_objects=True,
        lifecycle_rules=[expire_after_90_days],
    )

269 

def _create_studio_only_bucket(self) -> None:
    """Create ``Studio_Only_Bucket``, the notebook-private scratch/output bucket.

    The physical name is ``gco-analytics-studio-<account>-<region>``; the
    fixed prefix keeps ARN patterns such as
    ``arn:aws:s3:::gco-analytics-studio-*`` stable. Objects are encrypted
    with ``self.kms_key`` (bucket keys enabled), public access is blocked,
    versioning is on, and server-access logs go to
    ``self.access_logs_bucket`` under ``studio-only/``. The bucket and its
    contents are removed with the stack.

    An explicit ``DenyInsecureTransport`` statement is added on top of
    ``enforce_ssl=True`` so the synthesized template carries a SID that can
    be asserted on.

    The bucket is stored on ``self.studio_only_bucket``.
    """
    physical_name = f"gco-analytics-studio-{self.account}-{self.region}"
    self.studio_only_bucket = s3.Bucket(
        self,
        "StudioOnlyBucket",
        bucket_name=physical_name,
        encryption=s3.BucketEncryption.KMS,
        encryption_key=self.kms_key,
        bucket_key_enabled=True,
        block_public_access=s3.BlockPublicAccess.BLOCK_ALL,
        enforce_ssl=True,
        versioned=True,
        removal_policy=RemovalPolicy.DESTROY,
        auto_delete_objects=True,
        server_access_logs_bucket=self.access_logs_bucket,
        server_access_logs_prefix="studio-only/",
    )

    # Deny every S3 action on the bucket and its objects when the request
    # did not arrive over TLS.
    bucket_arn = self.studio_only_bucket.bucket_arn
    deny_plaintext = iam.PolicyStatement(
        sid="DenyInsecureTransport",
        effect=iam.Effect.DENY,
        principals=[iam.AnyPrincipal()],
        actions=["s3:*"],
        resources=[bucket_arn, f"{bucket_arn}/*"],
        conditions={"Bool": {"aws:SecureTransport": "false"}},
    )
    self.studio_only_bucket.add_to_resource_policy(deny_plaintext)

312 

313 # ================================================================== 

314 # Studio EFS 

315 # ================================================================== 

316 

def _create_studio_efs(self) -> None:
    """Create ``Studio_EFS`` and the security group that fronts it.

    The file system is encrypted at rest with ``self.kms_key``, has
    automatic backups enabled, sits in the private-with-egress subnets, and
    takes its removal policy from ``self.efs_removal``.

    The security group has no outbound rules and a single inbound rule:
    TCP/2049 (NFS) from the VPC CIDR block. The rule covers the whole VPC
    CIDR, not only the private subnets.

    No access points are defined here. Per the design notes they are
    created per user by the presigned-URL Lambda; that code is outside
    this method.

    NOTE(review): nothing in this method enforces TLS for NFS traffic;
    confirm where in-transit encryption is required, if anywhere.

    Sets ``self.studio_efs_security_group`` and ``self.studio_efs``.
    """
    nfs_security_group = ec2.SecurityGroup(
        self,
        "StudioEfsSecurityGroup",
        vpc=self.vpc,
        description="SG for Studio_EFS - allows NFS from the analytics VPC only",
        allow_all_outbound=False,
    )
    self.studio_efs_security_group = nfs_security_group

    # Only NFS, and only from addresses inside the analytics VPC.
    vpc_peer = ec2.Peer.ipv4(self.vpc.vpc_cidr_block)
    nfs_port = ec2.Port.tcp(2049)
    nfs_security_group.add_ingress_rule(
        peer=vpc_peer,
        connection=nfs_port,
        description="NFS from analytics VPC private subnets",
    )

    private_subnets = ec2.SubnetSelection(subnet_type=ec2.SubnetType.PRIVATE_WITH_EGRESS)
    self.studio_efs = efs.FileSystem(
        self,
        "StudioEfs",
        vpc=self.vpc,
        vpc_subnets=private_subnets,
        encrypted=True,
        kms_key=self.kms_key,
        enable_automatic_backups=True,
        removal_policy=self.efs_removal,
        security_group=nfs_security_group,
    )

353 

354 # ================================================================== 

355 # SageMaker execution role + grants 

356 # ================================================================== 

357 

def _create_execution_role_and_grants(self) -> None:
    """Create ``SageMaker_Execution_Role`` and attach its (non-cluster-shared) grants.

    The role is assumed by ``sagemaker.amazonaws.com`` and named
    ``<SAGEMAKER_ROLE_NAME_PREFIX>-gco-analytics-exec-<region>``. The
    constant's value is defined in ``gco.stacks.constants``; the design
    notes state SageMaker requires an ``AmazonSageMaker`` prefix for any
    role used by a Studio domain.

    Grants attached here, in order:

    * RW on ``Studio_Only_Bucket`` + encrypt/decrypt on ``Analytics_KMS_Key``
    * ``kms:CreateGrant`` / ``kms:DescribeKey`` on ``Analytics_KMS_Key``
    * ``execute-api:Invoke`` on ``/api/v1/*`` and ``/inference/*`` of the
      ``prod`` stage — any REST API id and any HTTP method, in the
      API-gateway region (not GET-only)
    * ``sqs:SendMessage`` on regional job queues (wildcard ARN pattern)
    * ``ssm:GetParameter`` / ``ssm:GetParameters`` on the
      ``Cluster_Shared_Bucket`` metadata parameters in the global region —
      lets notebooks look up the bucket name/arn/region at runtime
      without a per-user export step
    * EFS client mount actions scoped to the ``Studio_EFS`` ARN, plus
      ``DescribeMountTargets`` / ``DescribeFileSystems`` on ``*``
    * Studio UI actions on domain / user-profile / space / app ARNs
    * EMR Serverless application and job-run actions on ``*``
    * AWS-managed ``AmazonSageMakerFullAccess`` — always attached
      (with a nag suppression), paired with an inline
      ``sagemaker-mlflow:*`` statement for the MLflow data-plane
      namespace that the managed policy is stated not to cover
    * ``sts:GetCallerIdentity`` on ``*``
    * HyperPod training-job actions when ``hyperpod.enabled=true``
    * AWS-managed ``AmazonSageMakerCanvasFullAccess`` (with a nag
      suppression) when ``canvas.enabled=true``

    Finally a resource policy is added to ``Studio_EFS`` itself.

    The ``Cluster_Shared_Bucket`` grant lives in its own helper
    (:meth:`_grant_sagemaker_role_on_cluster_shared_bucket`) because the
    bucket ARN is resolved via a cross-region SSM read.

    Requires ``self.studio_only_bucket``, ``self.kms_key`` and
    ``self.studio_efs`` to exist already; sets
    ``self.sagemaker_execution_role``.
    """
    # NOTE(review): the description below still says "read-only GCO API
    # access", but the execute-api statement further down allows every
    # HTTP method. The string is deployed text, so it is left untouched
    # here — confirm whether it should be reworded.
    self.sagemaker_execution_role = iam.Role(
        self,
        "SagemakerExecutionRole",
        role_name=f"{SAGEMAKER_ROLE_NAME_PREFIX}-gco-analytics-exec-{self.region}",
        assumed_by=iam.ServicePrincipal("sagemaker.amazonaws.com"),
        description=(
            "SageMaker_Execution_Role - assumed by notebooks in the Studio "
            "domain. Grants RW on Studio_Only_Bucket and (via a separate "
            "cross-region policy) Cluster_Shared_Bucket, plus read-only GCO "
            "API access, SQS job submission, and cross-region ssm:GetParameter "
            "on the Cluster_Shared_Bucket metadata parameters."
        ),
    )

    # Bucket + KMS grants — studio-only scratch space. Analytics_KMS_Key
    # already has encrypt/decrypt in its key policy for the sagemaker
    # service principal, but role-level grants are still required for
    # IAM-side authorization per the double-auth model.
    self.studio_only_bucket.grant_read_write(self.sagemaker_execution_role)
    self.kms_key.grant_encrypt_decrypt(self.sagemaker_execution_role)

    # SageMaker needs CreateGrant on the KMS key to delegate encryption
    # to EBS when creating space volumes. The grant is scoped to the key.
    # NOTE(review): no condition (e.g. ``kms:GrantIsForAWSResource``) is
    # passed to ``grant`` here, so the grantee is not restricted to AWS
    # services — confirm whether that restriction was intended.
    self.kms_key.grant(
        self.sagemaker_execution_role,
        "kms:CreateGrant",
        "kms:DescribeKey",
    )

    # GCO API scope — notebooks need both read-only GET operations
    # (list jobs, describe endpoints, fetch health) and job/inference
    # submission actions (POST manifests, PUT template updates, DELETE
    # jobs). Grant the full ``/api/v1/*`` method surface instead of
    # GET-only so users can submit new jobs, manage templates, and
    # tear things down from inside a notebook without bouncing
    # through a service account.
    #
    # The exact API id is not known here (it lives in the api-gateway
    # stack and is discovered through SSM or CfnOutput at synth time
    # — see the api_gateway_global_stack wiring). Scope to the
    # api-gateway region with any REST API id for now; tighter scope
    # is applied once ``AnalyticsApiConfig`` is wired in.
    api_gw_region = self.config.get_api_gateway_region()
    self.sagemaker_execution_role.add_to_policy(
        iam.PolicyStatement(
            effect=iam.Effect.ALLOW,
            actions=["execute-api:Invoke"],
            resources=[
                # ``*/prod/*/api/v1/*`` — any API id, any HTTP method
                # (GET/POST/PUT/DELETE/PATCH), any path below
                # /api/v1/. /studio/* is not matched by either ARN
                # here; Canvas users go through their own
                # Cognito-authorized ``/studio/login`` route.
                f"arn:aws:execute-api:{api_gw_region}:{self.account}:*/prod/*/api/v1/*",
                # ``/inference/*`` proxies through to regional ALBs
                # for in-cluster model endpoints — notebooks need
                # the full method surface here too.
                f"arn:aws:execute-api:{api_gw_region}:{self.account}:*/prod/*/inference/*",
            ],
        )
    )

    # SQS job submission — scoped to the regional queue name pattern
    # ``<project>-jobs-<region>`` written by
    # ``GCORegionalStack._create_sqs_queue``. The exact region isn't
    # known at synth time (queues live in regional stacks), so we use
    # ``*`` in the region component with the project name fixed.
    project_name = self.config.get_project_name()
    self.sagemaker_execution_role.add_to_policy(
        iam.PolicyStatement(
            effect=iam.Effect.ALLOW,
            actions=["sqs:SendMessage"],
            resources=[
                f"arn:aws:sqs:*:{self.account}:{project_name}-jobs-*",
            ],
        )
    )

    # ssm:GetParameter on the Cluster_Shared_Bucket metadata params. The
    # three parameters (name/arn/region) live in the global region where
    # GCOGlobalStack is deployed, not in the analytics region. Scoping
    # to the CLUSTER_SHARED_SSM_PARAMETER_PREFIX tree under the global
    # region means a notebook can fetch the bucket name at runtime via
    #   boto3.client('ssm', region_name='<global-region>').get_parameter(
    #       Name='/gco/cluster-shared-bucket/name')['Parameter']['Value']
    # without any JupyterLab-terminal export step.
    global_region = self.config.get_global_region()
    self.sagemaker_execution_role.add_to_policy(
        iam.PolicyStatement(
            effect=iam.Effect.ALLOW,
            actions=["ssm:GetParameter", "ssm:GetParameters"],
            resources=[
                f"arn:aws:ssm:{global_region}:{self.account}:parameter{CLUSTER_SHARED_SSM_PARAMETER_PREFIX}/*",
            ],
        )
    )

    # EFS mount actions — scoped to the Studio EFS file-system ARN.
    self.sagemaker_execution_role.add_to_policy(
        iam.PolicyStatement(
            effect=iam.Effect.ALLOW,
            actions=[
                "elasticfilesystem:ClientMount",
                "elasticfilesystem:ClientWrite",
                "elasticfilesystem:ClientRootAccess",
            ],
            resources=[self.studio_efs.file_system_arn],
        )
    )

    # DescribeMountTargets does not support resource-level scoping —
    # SageMaker calls it during user profile provisioning to validate
    # the EFS mount configuration. DescribeFileSystems is granted
    # alongside it on the same ``*`` resource.
    self.sagemaker_execution_role.add_to_policy(
        iam.PolicyStatement(
            effect=iam.Effect.ALLOW,
            actions=[
                "elasticfilesystem:DescribeMountTargets",
                "elasticfilesystem:DescribeFileSystems",
            ],
            resources=["*"],
        )
    )

    # SageMaker Studio UI actions — the execution role is assumed by
    # the Studio notebook runtime and needs these to render the IDE,
    # list spaces/apps, and manage its own lifecycle. Scoped to any
    # domain / user-profile / space / app in this stack's region and
    # account.
    self.sagemaker_execution_role.add_to_policy(
        iam.PolicyStatement(
            effect=iam.Effect.ALLOW,
            actions=[
                "sagemaker:DescribeDomain",
                "sagemaker:DescribeUserProfile",
                "sagemaker:CreatePresignedDomainUrl",
                "sagemaker:ListSpaces",
                "sagemaker:ListApps",
                "sagemaker:DescribeApp",
                "sagemaker:DescribeSpace",
                "sagemaker:CreateApp",
                "sagemaker:DeleteApp",
                "sagemaker:CreateSpace",
                "sagemaker:DeleteSpace",
                "sagemaker:UpdateSpace",
                "sagemaker:ListTags",
                "sagemaker:AddTags",
            ],
            resources=[
                f"arn:aws:sagemaker:{self.region}:{self.account}:domain/*",
                f"arn:aws:sagemaker:{self.region}:{self.account}:user-profile/*/*",
                f"arn:aws:sagemaker:{self.region}:{self.account}:space/*/*",
                f"arn:aws:sagemaker:{self.region}:{self.account}:app/*/*/*/*",
            ],
        )
    )

    # EMR Serverless — allow the execution role to discover, connect to,
    # and manage the EMR Serverless application from Studio's Data panel.
    self.sagemaker_execution_role.add_to_policy(
        iam.PolicyStatement(
            effect=iam.Effect.ALLOW,
            actions=[
                "emr-serverless:ListApplications",
                "emr-serverless:GetApplication",
                "emr-serverless:CreateApplication",
                "emr-serverless:StartApplication",
                "emr-serverless:StopApplication",
                "emr-serverless:StartJobRun",
                "emr-serverless:GetJobRun",
                "emr-serverless:ListJobRuns",
                "emr-serverless:CancelJobRun",
                "emr-serverless:GetDashboardForJobRun",
                "emr-serverless:AccessLivyEndpoints",
            ],
            resources=["*"],
        )
    )

    # SageMaker-managed MLflow + Model Registry + MLflow Apps.
    #
    # We attach the AWS-managed ``AmazonSageMakerFullAccess`` policy
    # for two reasons:
    #
    # 1. MLflow Apps (the newer Studio panel, separate from MLflow
    #    Tracking Servers) requires ``sagemaker:CreateMlflowApp``/
    #    ``ListMlflowApps``/``DescribeMlflowApp`` etc. The action
    #    surface is evolving quickly and the managed policy tracks
    #    it. Enumerating it inline would drift.
    # 2. SageMaker Model Registry (``sagemaker:*ModelPackage*``),
    #    Studio space/app lifecycle, training-job submission, and
    #    the "related-services" helpers (S3, ECR, CloudWatch Logs)
    #    are already covered by the managed policy — keeping them
    #    inline duplicated the managed policy and kept us in a
    #    catch-up loop whenever SageMaker shipped a new feature.
    #
    # The managed policy is ``Resource: *`` by design; the trade-off
    # (broader-than-least-privilege inside the role) is
    # acknowledged with a nag suppression below. The inline
    # ``sagemaker-mlflow:*`` statement that follows is still
    # required because the managed policy does NOT cover the
    # ``sagemaker-mlflow`` data-plane namespace — that's what the
    # MLflow SDK talks to over SigV4 for ``log_metric``,
    # ``log_artifact``, ``register_model``, etc.
    #
    # The suppression helper is imported at function scope; both the
    # FullAccess suppression here and the Canvas one further down use it.
    from gco.stacks.nag_suppressions import suppress_managed_policy_opt_in

    self.sagemaker_execution_role.add_managed_policy(
        iam.ManagedPolicy.from_aws_managed_policy_name("AmazonSageMakerFullAccess")
    )
    suppress_managed_policy_opt_in(
        self.sagemaker_execution_role,
        managed_policy_name="AmazonSageMakerFullAccess",
        reason=(
            "AmazonSageMakerFullAccess is attached to "
            "SageMaker_Execution_Role when analytics_environment.enabled=true. "
            "The managed policy covers MLflow Apps, MLflow Tracking "
            "Servers, SageMaker Model Registry, Studio space/app "
            "lifecycle, training-job submission, and the cross-service "
            "helpers (S3, ECR, CloudWatch Logs) SageMaker needs to "
            "render the IDE and run jobs. Enumerating this surface "
            "inline drifts out of date within weeks — tracking the "
            "AWS-managed policy is the supported path. The inline "
            "``sagemaker-mlflow:*`` statement that follows covers "
            "the data-plane namespace the managed policy does not "
            "include. Users who want a locked-down alternative can "
            "disable the analytics environment."
        ),
    )

    # MLflow SDK data-plane (``sagemaker-mlflow:*``) — required for
    # ``mlflow.log_metric``, ``mlflow.log_artifact``,
    # ``mlflow.register_model``, etc. to round-trip through the
    # SageMaker-managed tracking server over SigV4. The managed
    # policy above covers the ``sagemaker:*`` control-plane
    # namespace but not ``sagemaker-mlflow:*`` (a separate service
    # prefix), so we keep this inline and scope it to the
    # api-gateway region where the tracking server and MLflow apps
    # live. Note this statement uses ``api_gw_region`` while the Studio
    # UI statement above uses ``self.region``; the two are expected to
    # be the same region for this stack.
    self.sagemaker_execution_role.add_to_policy(
        iam.PolicyStatement(
            effect=iam.Effect.ALLOW,
            actions=["sagemaker-mlflow:*"],
            resources=[
                f"arn:aws:sagemaker:{api_gw_region}:{self.account}:mlflow-tracking-server/*",
                f"arn:aws:sagemaker:{api_gw_region}:{self.account}:mlflow-app/*",
            ],
        )
    )

    # MLflow's SigV4 plug-in exchanges STS ``GetCallerIdentity`` on
    # every request — the execution role needs that on ``*``.
    # ``sts:GetCallerIdentity`` does not support resource-level
    # scoping, so Resource: * is the only valid value.
    self.sagemaker_execution_role.add_to_policy(
        iam.PolicyStatement(
            effect=iam.Effect.ALLOW,
            actions=["sts:GetCallerIdentity"],
            resources=["*"],
        )
    )

    # HyperPod sub-toggle — additional SageMaker actions for training-job
    # submission and cluster-instance lifecycle management.
    # ``resources=["*"]`` is the documented scope; the HyperPod actions
    # themselves encode the per-training-job authorization model.
    # NOTE(review): ``sagemaker:ClusterInstance`` and
    # ``sagemaker:ClusterInstanceGroup`` do not look like API action
    # names (no verb) — verify them against the SageMaker entry in the
    # Service Authorization Reference.
    if self.hyperpod_enabled:
        self.sagemaker_execution_role.add_to_policy(
            iam.PolicyStatement(
                effect=iam.Effect.ALLOW,
                actions=[
                    "sagemaker:CreateTrainingJob",
                    "sagemaker:DescribeTrainingJob",
                    "sagemaker:StopTrainingJob",
                    "sagemaker:ClusterInstance",
                    "sagemaker:ClusterInstanceGroup",
                    "sagemaker:DescribeClusterNode",
                    "sagemaker:ListClusterNodes",
                ],
                resources=["*"],
            )
        )

    # Canvas sub-toggle — attach AWS-managed ``AmazonSageMakerCanvasFullAccess``
    # to the execution role so users can launch the Canvas no-code ML
    # app from inside Studio. The managed policy is used deliberately
    # (rather than enumerating each action) because Canvas's per-feature
    # permission surface — Bedrock for generative AI, Forecast for time
    # series, Rekognition for image classification, S3 writes for
    # datasets, Athena for SQL sources, etc. — is large and evolves with
    # every Canvas release. Tracking AWS's managed policy means we pick
    # up new Canvas capabilities automatically without shipping a CDK
    # change. The trade-off (broader-than-least-privilege inside the
    # role) is acknowledged with a dedicated nag suppression below.
    #
    # The matching ``CanvasAppSettings`` override on the Studio domain
    # lives in ``_create_studio_domain`` so the Canvas tile shows up
    # on the Studio landing page when the toggle is on.
    if self.canvas_enabled:
        self.sagemaker_execution_role.add_managed_policy(
            iam.ManagedPolicy.from_aws_managed_policy_name("AmazonSageMakerCanvasFullAccess")
        )

        suppress_managed_policy_opt_in(
            self.sagemaker_execution_role,
            managed_policy_name="AmazonSageMakerCanvasFullAccess",
            reason=(
                "AmazonSageMakerCanvasFullAccess is attached to "
                "SageMaker_Execution_Role when analytics_environment.canvas.enabled=true. "
                "Canvas is an opt-in sub-toggle (off by default) and its managed "
                "policy is preferred over an enumerated least-privilege policy "
                "because Canvas's cross-service permission surface (Bedrock, "
                "Forecast, Rekognition, Athena, S3 dataset writes, etc.) evolves "
                "with every Canvas release — tracking the managed policy keeps "
                "Canvas functional as AWS ships new features. Users who want a "
                "locked-down alternative can keep the toggle off."
            ),
        )

    # EFS resource policy — must include DescribeMountTargets without
    # the AccessedViaMountTarget condition because SageMaker calls it
    # during user-profile provisioning. The statement uses AnyPrincipal
    # with no conditions, so it is not limited to the execution role,
    # the cleanup Lambda, or the SageMaker service. Security is expected
    # to come from the VPC security group (NFS traffic only from within
    # the VPC) and IAM policies on each role — the resource policy is
    # permissive by design to avoid the intersection-model blocking
    # control-plane calls.
    # NOTE(review): earlier notes say DescribeAccessPoints /
    # DescribeFileSystems cannot appear in an EFS resource policy, yet
    # ``elasticfilesystem:DescribeFileSystems`` is listed below — confirm
    # which is correct. The four ``Delete*`` actions are also granted to
    # AnyPrincipal here; confirm that breadth is intended.
    self.studio_efs.add_to_resource_policy(
        iam.PolicyStatement(
            effect=iam.Effect.ALLOW,
            principals=[iam.AnyPrincipal()],
            actions=[
                "elasticfilesystem:ClientMount",
                "elasticfilesystem:ClientWrite",
                "elasticfilesystem:ClientRootAccess",
                "elasticfilesystem:DescribeMountTargets",
                "elasticfilesystem:DescribeFileSystems",
                "elasticfilesystem:DeleteAccessPoint",
                "elasticfilesystem:DeleteMountTarget",
                "elasticfilesystem:DeleteFileSystem",
                "elasticfilesystem:DeleteFileSystemPolicy",
            ],
        )
    )

741 

def _grant_sagemaker_role_on_cluster_shared_bucket(self) -> None:
    """Give ``SageMaker_Execution_Role`` RW + KMS access to ``Cluster_Shared_Bucket``.

    The bucket is owned by ``GCOGlobalStack`` in the global region. Its ARN
    is read from the SSM parameter ``<CLUSTER_SHARED_SSM_PARAMETER_PREFIX>/arn``
    by an ``AwsCustomResource`` that issues ``ssm:GetParameter`` against the
    global region on both create and update — the same pattern as
    ``GCORegionalStack._resolve_cluster_shared_bucket_from_ssm``.

    NOTE(review): because the lookup is a custom resource, the ARN is
    presumably only known at deploy time and is handled here as a token,
    not as a literal string — confirm before relying on its value at synth.

    One ``iam.Policy`` attached to the role carries two statements:

    1. S3 — ``GetObject``/``PutObject``/``DeleteObject``/``ListBucket``/
       ``GetBucketLocation`` on ``<arn>`` and ``<arn>/*``.
    2. KMS — ``Decrypt``/``GenerateDataKey`` on ``*``, conditioned on
       ``kms:ViaService=s3.<global-region>.amazonaws.com``.

    Only the role side is touched; the bucket policy stays owned by
    ``GCOGlobalStack``.
    """
    from cdk_nag import NagSuppressions

    global_region = self.config.get_global_region()
    parameter_name = f"{CLUSTER_SHARED_SSM_PARAMETER_PREFIX}/arn"

    # Create and update perform the exact same SDK call, so describe it once.
    get_parameter_kwargs: dict[str, Any] = {
        "service": "SSM",
        "action": "getParameter",
        "parameters": {"Name": parameter_name},
        "region": global_region,
        "physical_resource_id": cr.PhysicalResourceId.of("analytics-cluster-shared-arn"),
    }
    arn_lookup = cr.AwsCustomResource(
        self,
        "ReadClusterSharedBucketArn",
        on_create=cr.AwsSdkCall(**get_parameter_kwargs),
        on_update=cr.AwsSdkCall(**get_parameter_kwargs),
        policy=cr.AwsCustomResourcePolicy.from_sdk_calls(
            resources=cr.AwsCustomResourcePolicy.ANY_RESOURCE
        ),
    )

    # The custom resource's SDK-call policy is ``Resource::*`` (ANY_RESOURCE
    # above). The suppression is limited to that one finding; the action is
    # a fixed ``ssm:GetParameter`` for a single literal parameter Name.
    lookup_wildcard_reason = (
        "Cross-region ssm:GetParameter for "
        f"{parameter_name} in the global region. The "
        "AwsCustomResource SDK-call policy is scoped to a "
        "single fixed action (ssm:GetParameter) with a "
        "fixed parameter Name — the Resource: * is the "
        "CDK-documented escape hatch because the parameter "
        "ARN is not known to the calling principal's "
        "region. Effective blast radius: one parameter."
    )
    NagSuppressions.add_resource_suppressions(
        arn_lookup,
        [
            {
                "id": "AwsSolutions-IAM5",
                "reason": lookup_wildcard_reason,
                "appliesTo": ["Resource::*"],
            },
        ],
        apply_to_children=True,
    )

    shared_arn = arn_lookup.get_response_field("Parameter.Value")

    # Bucket-level and object-level access on the shared bucket.
    bucket_rw_statement = iam.PolicyStatement(
        effect=iam.Effect.ALLOW,
        actions=[
            "s3:GetObject",
            "s3:PutObject",
            "s3:DeleteObject",
            "s3:ListBucket",
            "s3:GetBucketLocation",
        ],
        resources=[shared_arn, f"{shared_arn}/*"],
    )
    # KMS usage is only allowed when the call is made through S3 in the
    # global region (``kms:ViaService`` condition).
    kms_via_s3_statement = iam.PolicyStatement(
        effect=iam.Effect.ALLOW,
        actions=["kms:Decrypt", "kms:GenerateDataKey"],
        resources=["*"],
        conditions={
            "StringEquals": {
                "kms:ViaService": f"s3.{global_region}.amazonaws.com",
            }
        },
    )

    # Policy attached to the role (identity side), never to the bucket.
    iam.Policy(
        self,
        "SagemakerClusterSharedBucketGrant",
        roles=[self.sagemaker_execution_role],
        statements=[bucket_rw_statement, kms_via_s3_statement],
    )

    # Suppress the ``<arn>/*`` object-key wildcard finding for the S3
    # statement above.
    # NOTE(review): the policy above is created with the stack (``self``) as
    # its scope, not the role construct — confirm this role-level
    # suppression actually reaches it, or that a stack-level suppression
    # covers the finding.
    object_wildcard_reason = (
        "The SageMaker RW grant on Cluster_Shared_Bucket "
        "uses an <arn>/* object-key wildcard on the literal "
        "ARN resolved from SSM. The wildcard covers object "
        "keys within the single always-on "
        "gco-cluster-shared-<account>-<region> bucket."
    )
    NagSuppressions.add_resource_suppressions(
        self.sagemaker_execution_role,
        [
            {
                "id": "AwsSolutions-IAM5",
                "reason": object_wildcard_reason,
                "appliesTo": [
                    {"regex": r"/^Resource::<ReadClusterSharedBucketArn.*>\/\*$/"},
                ],
            },
        ],
        apply_to_children=True,
    )

869 

870 # ================================================================== 

871 # SageMaker Studio domain 

872 # ================================================================== 

873 

def _create_studio_domain(self) -> None:
    """Create the VpcOnly SageMaker Studio domain and its teardown helper.

    Performed in this order:

    1. Default user settings: the execution role from
       :meth:`_create_execution_role_and_grants`, a custom file system
       config mounting ``self.studio_efs`` at ``/home/sagemaker-user``, and
       a dedicated security group with unrestricted egress.
    2. The ``sagemaker.CfnDomain`` itself — ``auth_mode="IAM"``,
       ``app_network_access_type="VpcOnly"``, private-with-egress subnets,
       encrypted with ``self.kms_key`` — plus an explicit dependency on the
       EFS file system and a ``StudioDomainName`` output.
    3. A cleanup Lambda behind a provider-backed custom resource (kept on
       ``self._cleanup_resource``) together with its cdk-nag suppressions.

    ``CustomImages`` is intentionally never set, so Studio uses the stock
    AWS-published Distribution images (a tested invariant). Per-user
    ``/home/<username>`` isolation is not configured here; it relies on the
    access points the presigned-URL Lambda creates lazily on first login.
    """
    from aws_cdk import CustomResource
    from aws_cdk import custom_resources as cr_provider
    from cdk_nag import NagSuppressions

    studio_subnets = self.vpc.select_subnets(
        subnet_type=ec2.SubnetType.PRIVATE_WITH_EGRESS
    ).subnets

    home_mount = sagemaker.CfnDomain.CustomFileSystemConfigProperty(
        efs_file_system_config=sagemaker.CfnDomain.EFSFileSystemConfigProperty(
            file_system_id=self.studio_efs.file_system_id,
            file_system_path="/home/sagemaker-user",
        ),
    )

    # An ``S3FileSystemConfig`` mount of ``Cluster_Shared_Bucket`` under
    # ``/mount/cluster-shared`` was evaluated as well. aws-cdk-lib exposes
    # the property and the template synthesizes, but the Studio service
    # rejects it at create time with ``Invalid request provided:
    # S3FileSystemConfig for SageMaker AI Studio is not supported yet.``
    # Notebooks therefore reach that bucket through ``boto3``, authorized by
    # :meth:`_grant_sagemaker_role_on_cluster_shared_bucket`. Revisit once
    # Studio supports S3 custom file systems.
    custom_file_systems: list[sagemaker.CfnDomain.CustomFileSystemConfigProperty] = [
        home_mount,
    ]

    # Studio compute security group with open egress so notebooks can reach
    # the internet (pip, git, etc.) through the NAT gateway. SageMaker's
    # default VpcOnly security group only permits NFS traffic, which would
    # block all internet access from notebooks.
    self.studio_compute_sg = ec2.SecurityGroup(
        self,
        "StudioComputeSg",
        vpc=self.vpc,
        description="Allows outbound internet access from Studio notebooks",
        allow_all_outbound=True,
    )

    # ``jupyter_lab_app_settings`` is left out on purpose: with it absent
    # there is no ``CustomImages`` key on the domain and no SageMaker image
    # resource in the template.
    default_user_settings = sagemaker.CfnDomain.UserSettingsProperty(
        execution_role=self.sagemaker_execution_role.role_arn,
        custom_file_system_configs=custom_file_systems,
        security_groups=[self.studio_compute_sg.security_group_id],
    )

    self.studio_domain = sagemaker.CfnDomain(
        self,
        "StudioDomain",
        auth_mode="IAM",
        app_network_access_type="VpcOnly",
        domain_name=f"gco-studio-{self.region}",
        subnet_ids=[subnet.subnet_id for subnet in studio_subnets],
        vpc_id=self.vpc.vpc_id,
        kms_key_id=self.kms_key.key_id,
        default_user_settings=default_user_settings,
    )

    # Canvas sub-toggle (UI side) is **IAM-only**: the
    # ``AmazonSageMakerCanvasFullAccess`` managed policy attached in
    # :meth:`_create_execution_role_and_grants` is what surfaces the Canvas
    # tile on the Studio landing page.
    #
    # No ``DefaultUserSettings.CanvasAppSettings`` block is injected on the
    # domain. ``AWS::SageMaker::Domain`` does not accept that property (only
    # ``AWS::SageMaker::UserProfile`` does), so an override fails early
    # validation with ``Unsupported property [CanvasAppSettings]``.
    # Operators who want per-user Canvas defaults can set
    # ``CanvasAppSettings`` at the ``UserProfile`` level.

    # The domain checks for EFS mount targets in every subnet before it
    # stabilizes. The ``file_system_id`` reference alone does not give CDK
    # that ordering, so declare it explicitly.
    self.studio_domain.node.add_dependency(self.studio_efs)

    CfnOutput(
        self,
        "StudioDomainName",
        value=self.studio_domain.domain_name or "",
        description="Name of the SageMaker Studio domain",
    )

    # ------------------------------------------------------------------
    # Teardown: a custom resource that, on stack deletion, removes the
    # domain's user profiles and the EFS access points so CloudFormation
    # can delete the domain and file system cleanly.
    # ------------------------------------------------------------------
    cleanup_fn = lambda_.Function(
        self,
        "CleanupFunction",
        runtime=getattr(lambda_.Runtime, LAMBDA_PYTHON_RUNTIME),
        handler="handler.handler",
        code=lambda_.Code.from_asset("lambda/analytics-cleanup"),
        # 15 minutes budgets for several async drain loops in series:
        # apps (~2 min), spaces (~3 min), user profiles (~3 min),
        # SageMaker-managed EFS mount targets (~2 min), plus RPC latency
        # and security-group cleanup. With a handful of users it finishes
        # in well under a minute.
        timeout=Duration.minutes(15),
        environment={
            "DOMAIN_ID": self.studio_domain.attr_domain_id,
            "EFS_ID": self.studio_efs.file_system_id,
            "REGION": self.region,
            "VPC_ID": self.vpc.vpc_id,
        },
    )

    # A customer-managed policy is used instead of an inline one. Inline
    # policies (from ``add_to_role_policy``) are separate CloudFormation
    # resources that can be deleted before the custom resource fires during
    # stack deletion; a managed policy listed in the role's managedPolicies
    # lives as long as the role does.
    cleanup_statement = iam.PolicyStatement(
        effect=iam.Effect.ALLOW,
        actions=[
            "sagemaker:ListApps",
            "sagemaker:DeleteApp",
            "sagemaker:ListSpaces",
            "sagemaker:DeleteSpace",
            "sagemaker:ListUserProfiles",
            "sagemaker:DeleteUserProfile",
            "sagemaker:DescribeDomain",
            "elasticfilesystem:DescribeAccessPoints",
            "elasticfilesystem:DeleteAccessPoint",
            "elasticfilesystem:DescribeFileSystems",
            "elasticfilesystem:DescribeMountTargets",
            "elasticfilesystem:DeleteMountTarget",
            "elasticfilesystem:DeleteFileSystem",
            "elasticfilesystem:DeleteFileSystemPolicy",
            "ec2:DescribeSecurityGroups",
            "ec2:DeleteSecurityGroup",
            "ec2:RevokeSecurityGroupIngress",
            "ec2:RevokeSecurityGroupEgress",
        ],
        resources=["*"],
    )
    cleanup_policy = iam.ManagedPolicy(
        self,
        "CleanupFunctionPolicy",
        statements=[cleanup_statement],
    )

    cleanup_role = cleanup_fn.role
    assert cleanup_role is not None  # always set for non-imported functions
    cleanup_role.add_managed_policy(cleanup_policy)

    cleanup_provider = cr_provider.Provider(
        self,
        "CleanupProvider",
        on_event_handler=cleanup_fn,
    )
    cleanup_resource = CustomResource(
        self,
        "DomainCleanup",
        service_token=cleanup_provider.service_token,
    )

    # Keep the managed policy alive until the cleanup custom resource has
    # finished: the resource depends on the policy, so the policy is
    # removed only afterwards.
    cleanup_resource.node.add_dependency(cleanup_policy)

    # Exposed so :meth:`_create_presigned_url_lambda` can wire an ordering
    # dependency between the cleanup resource and the presigned-URL Lambda
    # once that Lambda exists.
    self._cleanup_resource = cleanup_resource

    # cdk-nag suppressions. ``Resource::*`` is needed on the cleanup role
    # because the domain ID and EFS ID reach the handler as environment
    # variables rather than as ARNs in the policy.
    basic_execution_policy = (
        "Policy::arn:<AWS::Partition>:iam::aws:policy/service-role/AWSLambdaBasicExecutionRole"
    )
    NagSuppressions.add_resource_suppressions(
        cleanup_role,
        [
            {
                "id": "AwsSolutions-IAM5",
                "reason": (
                    "Cleanup Lambda needs Resource::* for "
                    "sagemaker:ListUserProfiles/DeleteUserProfile and "
                    "efs:DescribeAccessPoints/DeleteAccessPoint. These "
                    "APIs don't support resource-level scoping. The "
                    "Lambda only runs on stack deletion and is scoped "
                    "to the domain ID and EFS ID via environment variables."
                ),
                "appliesTo": ["Resource::*"],
            },
            {
                "id": "AwsSolutions-IAM4",
                "reason": (
                    "Cleanup Lambda uses AWSLambdaBasicExecutionRole "
                    "managed policy for CloudWatch Logs access."
                ),
                "appliesTo": [basic_execution_policy],
            },
        ],
        apply_to_children=True,
    )
    NagSuppressions.add_resource_suppressions(
        cleanup_provider,
        [
            {
                "id": "AwsSolutions-IAM5",
                "reason": (
                    "CDK Provider framework uses Resource::* for its "
                    "internal Lambda invocation policy."
                ),
                "appliesTo": [
                    "Resource::*",
                    {"regex": "/^Resource::<CleanupFunction.*\\.Arn>:\\*$/"},
                ],
            },
            {
                "id": "AwsSolutions-IAM4",
                "reason": "CDK Provider framework uses AWSLambdaBasicExecutionRole.",
                "appliesTo": [basic_execution_policy],
            },
            {
                "id": "AwsSolutions-L1",
                "reason": "CDK Provider framework manages its own Lambda runtime version.",
            },
        ],
        apply_to_children=True,
    )

1132 

1133 # ================================================================== 

1134 # EMR Serverless application 

1135 # ================================================================== 

1136 

def _create_emr_app(self) -> None:
    """Create the EMR Serverless Spark application inside the private VPC.

    The Spark runtime is pinned through ``EMR_SERVERLESS_RELEASE_LABEL`` so
    deployments get a reproducible release. Workers are placed on the VPC's
    private-with-egress subnets behind a dedicated security group
    (``self.emr_security_group``), i.e. the same subnet selection used for
    the Studio domain. The application is stored on ``self.emr_app``.
    """
    spark_subnets = self.vpc.select_subnets(
        subnet_type=ec2.SubnetType.PRIVATE_WITH_EGRESS
    ).subnets
    spark_subnet_ids = [subnet.subnet_id for subnet in spark_subnets]

    self.emr_security_group = ec2.SecurityGroup(
        self,
        "EmrServerlessSecurityGroup",
        vpc=self.vpc,
        description="SG for EMR Serverless Spark workers",
        allow_all_outbound=True,
    )

    spark_network = emrserverless.CfnApplication.NetworkConfigurationProperty(
        subnet_ids=spark_subnet_ids,
        security_group_ids=[self.emr_security_group.security_group_id],
    )
    self.emr_app = emrserverless.CfnApplication(
        self,
        "EmrServerlessApp",
        name=f"gco-spark-{self.region}",
        release_label=EMR_SERVERLESS_RELEASE_LABEL,
        type="SPARK",
        network_configuration=spark_network,
    )

1171 

1172 # ================================================================== 

1173 # Cognito pool + client + domain 

1174 # ================================================================== 

1175 

def _create_cognito_pool(self) -> None:
    """Create the Cognito user pool, client and domain that gate Studio logins.

    * Pool (``self.cognito_pool``): self sign-up disabled, username sign-in,
      e-mail auto-verification, a 12-character password policy requiring
      digits, symbols, upper- and lower-case, and threat protection set to
      ``NO_ENFORCEMENT``. Its removal policy is ``self.cognito_removal``.
    * Client (``self.cognito_client``): SRP and admin user/password auth
      flows, user-existence errors hidden, token revocation enabled. SRP is
      the flow used by ``gco analytics studio login``.
    * Domain (``self.cognito_domain``): the prefix comes from
      ``analytics_environment.cognito.domain_prefix`` when set, otherwise
      ``<COGNITO_DOMAIN_PREFIX_DEFAULT>-<account>``.

    Three stack outputs expose the pool ID, pool ARN and client ID.
    """
    password_rules = cognito.PasswordPolicy(
        min_length=12,
        require_digits=True,
        require_symbols=True,
        require_uppercase=True,
        require_lowercase=True,
    )

    # ``standard_threat_protection_mode`` replaces the deprecated
    # ``advanced_security_mode`` kwarg (aws-cdk-lib's AdvancedSecurityMode
    # enum is gone as of the Cognito November 2024 tier changes). The
    # default Lite feature plan does not support real threat protection, so
    # ``NO_ENFORCEMENT`` keeps the synth warning-free.
    # TODO: operators who want real threat protection should also set
    # ``feature_plan=cognito.FeaturePlan.ESSENTIALS`` (or
    # ``FeaturePlan.PLUS``) and switch this to
    # ``StandardThreatProtectionMode.FULL_FUNCTION``. That changes the
    # per-MAU price — see the Cognito pricing doc — which is why the default
    # stays on Lite+NO_ENFORCEMENT.
    threat_protection = cognito.StandardThreatProtectionMode.NO_ENFORCEMENT

    self.cognito_pool = cognito.UserPool(
        self,
        "StudioUserPool",
        self_sign_up_enabled=False,
        password_policy=password_rules,
        sign_in_aliases=cognito.SignInAliases(username=True),
        auto_verify=cognito.AutoVerifiedAttrs(email=True),
        standard_threat_protection_mode=threat_protection,
        removal_policy=self.cognito_removal,
    )

    self.cognito_client = self.cognito_pool.add_client(
        "StudioUserPoolClient",
        auth_flows=cognito.AuthFlow(
            user_srp=True,
            admin_user_password=True,
        ),
        prevent_user_existence_errors=True,
        enable_token_revocation=True,
    )

    # A configured override is used verbatim, without the account suffix,
    # because operators who override the prefix typically want a short
    # memorable value. The check is truthiness-based, so an empty override
    # falls back to the ``<default>-<account>`` form as well.
    domain_prefix = (
        self._cognito_domain_prefix_override
        or f"{COGNITO_DOMAIN_PREFIX_DEFAULT}-{self.account}"
    )
    self.cognito_domain = self.cognito_pool.add_domain(
        "StudioUserPoolDomain",
        cognito_domain=cognito.CognitoDomainOptions(domain_prefix=domain_prefix),
    )

    # (output id, value, description) — emitted in this order.
    pool_outputs = (
        (
            "CognitoUserPoolId",
            self.cognito_pool.user_pool_id,
            "ID of the Cognito user pool that gates SageMaker Studio",
        ),
        (
            "CognitoUserPoolArn",
            self.cognito_pool.user_pool_arn,
            "ARN of the Cognito user pool",
        ),
        (
            "CognitoUserPoolClientId",
            self.cognito_client.user_pool_client_id,
            "Client ID used by the GCO CLI for SRP auth",
        ),
    )
    for output_id, output_value, output_description in pool_outputs:
        CfnOutput(
            self,
            output_id,
            value=output_value,
            description=output_description,
        )

1259 

1260 # ================================================================== 

1261 # Presigned-URL Lambda 

1262 # ================================================================== 

1263 

def _create_presigned_url_lambda(self) -> None:
    """Create the ``Presigned_URL_Lambda`` that mints Studio login URLs.

    Wired into API Gateway's ``/studio/login`` route from
    ``GCOApiGatewayGlobalStack``. The function lives on
    ``GCOAnalyticsStack`` (not the API gateway stack) so its IAM role can
    reference ``SageMaker_Execution_Role.role_arn`` on ``PassRole`` and
    ``Studio_EFS.file_system_arn`` on the EFS access-point actions without
    a cross-stack import.

    Key configuration:

    * Runtime: ``LAMBDA_PYTHON_RUNTIME`` from ``gco.stacks.constants``.
    * Timeout: 29 s, equal to API Gateway's maximum integration timeout.
      NOTE(review): with equal limits the Lambda is not guaranteed to time
      out before API Gateway does; lower this value if returning the
      handler's own error instead of a gateway 504 matters.
    * Memory: 256 MB.
    * Tracing: ``ACTIVE`` so X-Ray captures the
      ``sagemaker:CreatePresignedDomainUrl`` call.
    * Log group: explicitly owned, 1-month retention, destroyed with the
      stack.

    IAM scoping (dedicated role ``self.presigned_url_lambda_role``):

    * ``sagemaker:ListDomains`` on ``*`` — no resource-level scoping is
      available; covered by the ``Resource::*`` nag suppression below.
    * ``sagemaker:DescribeDomain`` + ``CreatePresignedDomainUrl`` +
      ``DescribeUserProfile`` + ``CreateUserProfile`` + ``ListTags`` +
      ``AddTags`` on the domain and user-profile ARN families of this
      partition+region+account. The domain id segment is a wildcard.
    * ``iam:PassRole`` on ``SageMaker_Execution_Role.role_arn`` with a
      ``StringEquals iam:PassedToService=sagemaker.amazonaws.com``
      condition so the role can only be handed to SageMaker.
    * ``elasticfilesystem:DescribeAccessPoints`` + ``CreateAccessPoint`` +
      ``TagResource`` on ``Studio_EFS.file_system_arn`` for the lazy
      per-user access-point creation path in the handler.
    * ``AWSLambdaBasicExecutionRole`` managed policy for CloudWatch Logs.
      X-Ray write permissions are not part of that managed policy;
      presumably CDK adds them to the role because tracing is ``ACTIVE`` —
      verify in the synthesized template.

    Deletion ordering: the Lambda is made to depend on the cleanup custom
    resource created in ``_create_studio_domain`` so that, on stack
    deletion, the Lambda is removed *before* the cleanup handler runs.
    """
    from cdk_nag import NagSuppressions

    # Dedicated, narrowly scoped role that is not shared with any other
    # Lambda. The only managed policy is the AWS basic execution role;
    # every other permission is an inline statement owned here.
    # NOTE(review): this method only registers an AwsSolutions-IAM5
    # suppression — the IAM4 finding for the managed policy is presumably
    # handled by the stack-wide suppressions; confirm.
    self.presigned_url_lambda_role = iam.Role(
        self,
        "PresignedUrlLambdaRole",
        assumed_by=iam.ServicePrincipal("lambda.amazonaws.com"),
        description=(
            "Execution role for the analytics presigned-URL Lambda. "
            "Scoped to SageMaker domain + user-profile operations, "
            "PassRole on SageMaker_Execution_Role, and EFS access-"
            "point management on Studio_EFS."
        ),
        managed_policies=[
            iam.ManagedPolicy.from_aws_managed_policy_name(
                "service-role/AWSLambdaBasicExecutionRole"
            )
        ],
    )

    # ListDomains does not support resource-level scoping (AWS API
    # constraint), hence ``*``. The blast radius is documented in the nag
    # suppression below.
    self.presigned_url_lambda_role.add_to_policy(
        iam.PolicyStatement(
            effect=iam.Effect.ALLOW,
            actions=["sagemaker:ListDomains"],
            resources=["*"],
        )
    )

    # Domain + user-profile actions. The ARN patterns cover "any domain"
    # and "any user profile under any domain", pinned to this stack's
    # partition, region and account. ``self.partition`` is used instead of
    # a literal ``aws`` so the ARNs are also valid outside the standard
    # partition (e.g. GovCloud or China regions).
    sagemaker_arn_base = f"arn:{self.partition}:sagemaker:{self.region}:{self.account}"
    domain_arn_prefix = f"{sagemaker_arn_base}:domain/*"
    user_profile_arn_prefix = f"{sagemaker_arn_base}:user-profile/*/*"
    self.presigned_url_lambda_role.add_to_policy(
        iam.PolicyStatement(
            effect=iam.Effect.ALLOW,
            actions=[
                "sagemaker:DescribeDomain",
                "sagemaker:CreatePresignedDomainUrl",
                "sagemaker:DescribeUserProfile",
                "sagemaker:CreateUserProfile",
                "sagemaker:ListTags",
                "sagemaker:AddTags",
            ],
            resources=[domain_arn_prefix, user_profile_arn_prefix],
        )
    )

    # iam:PassRole — only SageMaker_Execution_Role, only to
    # sagemaker.amazonaws.com. This is what CreateUserProfile passes on the
    # ``ExecutionRole`` field.
    self.presigned_url_lambda_role.add_to_policy(
        iam.PolicyStatement(
            effect=iam.Effect.ALLOW,
            actions=["iam:PassRole"],
            resources=[self.sagemaker_execution_role.role_arn],
            conditions={
                "StringEquals": {
                    "iam:PassedToService": "sagemaker.amazonaws.com",
                }
            },
        )
    )

    # EFS access-point management, scoped to the Studio_EFS file system.
    # The Lambda creates one access point per Cognito user at first login
    # (lazy-in-Lambda approach).
    self.presigned_url_lambda_role.add_to_policy(
        iam.PolicyStatement(
            effect=iam.Effect.ALLOW,
            actions=[
                "elasticfilesystem:DescribeAccessPoints",
                "elasticfilesystem:CreateAccessPoint",
                "elasticfilesystem:TagResource",
            ],
            resources=[self.studio_efs.file_system_arn],
        )
    )

    # The log group is declared explicitly (rather than auto-created by
    # Lambda) so the 1-month retention is captured in the template.
    presigned_url_log_group = logs.LogGroup(
        self,
        "PresignedUrlLambdaLogGroup",
        retention=logs.RetentionDays.ONE_MONTH,
        removal_policy=RemovalPolicy.DESTROY,
    )

    self.presigned_url_lambda = lambda_.Function(
        self,
        "PresignedUrlFunction",
        runtime=getattr(lambda_.Runtime, LAMBDA_PYTHON_RUNTIME),
        handler="handler.lambda_handler",
        code=lambda_.Code.from_asset("lambda/analytics-presigned-url"),
        role=self.presigned_url_lambda_role,
        timeout=Duration.seconds(29),
        memory_size=256,
        tracing=lambda_.Tracing.ACTIVE,
        log_group=presigned_url_log_group,
        description=(
            "Exchanges a Cognito-authorized event for a presigned "
            "SageMaker Studio URL. Wired into /studio/login by "
            "GCOApiGatewayGlobalStack."
        ),
        environment={
            "STUDIO_DOMAIN_ID": self.studio_domain.attr_domain_id,
            "SAGEMAKER_EXECUTION_ROLE_ARN": self.sagemaker_execution_role.role_arn,
            "STUDIO_EFS_ID": self.studio_efs.file_system_id,
            "URL_EXPIRES_SECONDS": "300",
            "SESSION_EXPIRES_SECONDS": "43200",
        },
    )

    CfnOutput(
        self,
        "PresignedUrlLambdaArn",
        value=self.presigned_url_lambda.function_arn,
        description=(
            "ARN of the presigned-URL Lambda - consumed by the API "
            "Gateway stack's /studio/login integration."
        ),
    )

    # Nag suppression. ``appliesTo`` lists exactly the wildcard resources
    # granted above; the partition/region/account placeholders mirror the
    # tokens used in the ARN patterns.
    NagSuppressions.add_resource_suppressions(
        self.presigned_url_lambda_role,
        [
            {
                "id": "AwsSolutions-IAM5",
                "reason": (
                    "sagemaker:ListDomains does not support resource-"
                    "level scoping — the AWS API only accepts "
                    "Resource: *. Effective blast radius: a single "
                    "paginated list call per Lambda invocation "
                    "against this account's SageMaker control plane "
                    "in this region. The remaining SageMaker actions "
                    "(DescribeDomain, CreatePresignedDomainUrl, "
                    "DescribeUserProfile, CreateUserProfile, "
                    "ListTags, AddTags) are scoped to the "
                    "arn:<partition>:sagemaker:<region>:<account>:domain/* "
                    "and arn:<partition>:sagemaker:<region>:<account>:"
                    "user-profile/*/* ARN families, which is the "
                    "tightest we can achieve at synth time because "
                    "DomainId is only resolvable at invoke time."
                ),
                "appliesTo": [
                    "Resource::*",
                    (
                        "Resource::arn:<AWS::Partition>:sagemaker:<AWS::Region>:"
                        "<AWS::AccountId>:domain/*"
                    ),
                    (
                        "Resource::arn:<AWS::Partition>:sagemaker:<AWS::Region>:"
                        "<AWS::AccountId>:user-profile/*/*"
                    ),
                ],
            },
        ],
        apply_to_children=True,
    )

    # The cleanup custom resource must run AFTER this Lambda has been
    # deleted during stack destruction; otherwise in-flight login requests
    # could recreate user profiles between cleanup and domain deletion.
    # CloudFormation deletes a dependent resource before the resource it
    # depends on, so the Lambda has to depend on the cleanup resource (not
    # the other way round) to be removed first.
    self.presigned_url_lambda.node.add_dependency(self._cleanup_resource)

1481 

1482 # ================================================================== 

1483 # Nag suppressions 

1484 # ================================================================== 

1485 

def _apply_nag_suppressions(self) -> None:
    """Run the ``analytics`` branch of ``gco/stacks/nag_suppressions.py``.

    Delegates to ``apply_all_suppressions`` with ``stack_type="analytics"``,
    no regional list, and the global / API-gateway regions taken from
    ``self.config``.

    Per the notes kept with this stack, that branch is expected to call
    ``add_sagemaker_suppressions``, ``add_cognito_suppressions``,
    ``add_emr_serverless_suppressions``, ``add_storage_suppressions`` (for
    ``Studio_Only_Bucket`` + the access-logs bucket),
    ``add_lambda_suppressions`` (for the presigned-URL Lambda provider
    framework) and ``add_iam_suppressions`` (for cross-region SSM reads +
    CDK custom resources) — verify against the suppression module itself.
    """
    stack_config = self.config
    apply_all_suppressions(
        self,
        stack_type="analytics",
        regions=None,
        global_region=stack_config.get_global_region(),
        api_gateway_region=stack_config.get_api_gateway_region(),
    )