Coverage for gco/stacks/analytics_stack.py: 99%
157 statements
« prev ^ index » next coverage.py v7.14.1, created at 2026-06-15 15:07 +0000
« prev ^ index » next coverage.py v7.14.1, created at 2026-06-15 15:07 +0000
1"""Analytics stack for GCO - optional ML/analytics environment.
3Instantiated only when ``analytics_environment.enabled=true`` in ``cdk.json``.
4When the toggle is ``false`` (the default), ``app.py`` skips creating it so
5``cdk synth`` emits no SageMaker, EMR Serverless, or Cognito resources.
7Resources (wired in this order):
91. ``_create_kms_key`` — ``Analytics_KMS_Key``
102. ``_create_vpc_and_endpoints`` — private VPC + endpoints
113. ``_create_access_logs_bucket`` — S3 access-logs bucket
124. ``_create_studio_only_bucket`` — ``Studio_Only_Bucket``
135. ``_create_studio_efs`` — ``Studio_EFS``
146. ``_create_execution_role_and_grants`` — ``SageMaker_Execution_Role``
157. ``_grant_sagemaker_role_on_cluster_shared_bucket`` — cross-region IAM grant
168. ``_create_studio_domain`` — ``sagemaker.CfnDomain``
179. ``_create_emr_app`` — ``emrserverless.CfnApplication``
1810. ``_create_cognito_pool`` — Cognito pool + client + domain
1911. ``_create_presigned_url_lambda`` — ``Presigned_URL_Lambda``
2012. ``_apply_nag_suppressions`` — analytics-branch nag dispatch
22The API Gateway ``/studio/*`` wiring that consumes this Lambda lives in
23``gco/stacks/api_gateway_global_stack.py``.
24"""
26from __future__ import annotations
28from typing import Any
30from aws_cdk import (
31 CfnOutput,
32 Duration,
33 RemovalPolicy,
34 Stack,
35)
36from aws_cdk import aws_cognito as cognito
37from aws_cdk import aws_ec2 as ec2
38from aws_cdk import aws_efs as efs
39from aws_cdk import aws_emrserverless as emrserverless
40from aws_cdk import aws_iam as iam
41from aws_cdk import aws_kms as kms
42from aws_cdk import aws_lambda as lambda_
43from aws_cdk import aws_logs as logs
44from aws_cdk import aws_s3 as s3
45from aws_cdk import aws_sagemaker as sagemaker
46from aws_cdk import custom_resources as cr
47from constructs import Construct
49from gco.config.config_loader import ConfigLoader
50from gco.stacks.constants import (
51 CLUSTER_SHARED_SSM_PARAMETER_PREFIX,
52 COGNITO_DOMAIN_PREFIX_DEFAULT,
53 EMR_SERVERLESS_RELEASE_LABEL,
54 LAMBDA_PYTHON_RUNTIME,
55 SAGEMAKER_ROLE_NAME_PREFIX,
56)
57from gco.stacks.nag_suppressions import apply_all_suppressions
59# <pyflowchart-code-diagram> BEGIN - auto-inserted, do not edit
60# Flowchart(s) generated from this file:
61# * ``GCOAnalyticsStack.__init__`` -> ``diagrams/code_diagrams/gco/stacks/analytics_stack.GCOAnalyticsStack___init__.html``
62# (PNG: ``diagrams/code_diagrams/gco/stacks/analytics_stack.GCOAnalyticsStack___init__.png``)
63# * ``GCOAnalyticsStack._create_execution_role_and_grants`` -> ``diagrams/code_diagrams/gco/stacks/analytics_stack.GCOAnalyticsStack__create_execution_role_and_grants.html``
64# (PNG: ``diagrams/code_diagrams/gco/stacks/analytics_stack.GCOAnalyticsStack__create_execution_role_and_grants.png``)
65# * ``GCOAnalyticsStack._create_studio_domain`` -> ``diagrams/code_diagrams/gco/stacks/analytics_stack.GCOAnalyticsStack__create_studio_domain.html``
66# (PNG: ``diagrams/code_diagrams/gco/stacks/analytics_stack.GCOAnalyticsStack__create_studio_domain.png``)
67# Regenerate with ``python diagrams/code_diagrams/generate.py``.
68# <pyflowchart-code-diagram> END
71def _parse_removal(value: str) -> RemovalPolicy:
72 """Map a cdk.json removal-policy string to ``aws_cdk.RemovalPolicy``.
74 Translates ``analytics_environment.{efs,cognito}.removal_policy`` into
75 the matching enum member. Accepts ``"retain"`` / ``"destroy"``
76 (case-insensitive); raises ``ValueError`` on anything else.
77 """
78 normalized = value.strip().lower()
79 if normalized == "retain":
80 return RemovalPolicy.RETAIN
81 if normalized == "destroy":
82 return RemovalPolicy.DESTROY
83 raise ValueError(
84 f"analytics_environment removal_policy must be 'retain' or 'destroy', got {value!r}"
85 )
88class GCOAnalyticsStack(Stack):
89 """Optional ML/analytics environment: SageMaker Studio, EMR Serverless, Cognito.
91 Only instantiated when ``analytics_environment.enabled=true``. Lives in
92 the API gateway region so the presigned-URL Lambda can wire into the
93 existing ``/studio/*`` routes on ``GCOApiGatewayGlobalStack`` without
94 a cross-region hop.
95 """
def __init__(
    self,
    scope: Construct,
    construct_id: str,
    *,
    config: ConfigLoader,
    api_gateway_secret_arn: str | None = None,
    **kwargs: Any,
) -> None:
    """Read the analytics settings and build every resource in order.

    Args:
        scope: Parent construct, forwarded to ``Stack``.
        construct_id: Construct id, forwarded to ``Stack``.
        config: Loader whose ``get_analytics_config()`` supplies the
            ``hyperpod``, ``canvas``, ``efs`` and ``cognito`` settings.
        api_gateway_secret_arn: Stored on the instance only; nothing in
            this constructor consumes it yet.
        **kwargs: Remaining keyword arguments, forwarded to ``Stack``.

    Raises:
        ValueError: If either removal-policy setting is not recognised
            (raised by ``_parse_removal``).
    """
    super().__init__(scope, construct_id, **kwargs)

    self.config = config
    # Kept so the constructor signature stays stable once auth wiring
    # starts using it.
    self.api_gateway_secret_arn = api_gateway_secret_arn

    analytics = config.get_analytics_config()
    self.hyperpod_enabled: bool = bool(analytics["hyperpod"]["enabled"])
    self.canvas_enabled: bool = bool(analytics["canvas"]["enabled"])
    self.efs_removal: RemovalPolicy = _parse_removal(analytics["efs"]["removal_policy"])
    cognito_settings = analytics["cognito"]
    self.cognito_removal: RemovalPolicy = _parse_removal(cognito_settings["removal_policy"])
    self._cognito_domain_prefix_override: str | None = cognito_settings.get("domain_prefix")

    # The sequence below is load-bearing: later steps read attributes that
    # earlier steps set on ``self`` (key -> buckets/EFS -> role -> domain).
    for build_step in (
        self._create_kms_key,
        self._create_vpc_and_endpoints,
        self._create_access_logs_bucket,
        self._create_studio_only_bucket,
        self._create_studio_efs,
        self._create_execution_role_and_grants,
        self._grant_sagemaker_role_on_cluster_shared_bucket,
        self._create_studio_domain,
        self._create_emr_app,
        self._create_cognito_pool,
        self._create_presigned_url_lambda,
        self._apply_nag_suppressions,
    ):
        build_step()
135 # ==================================================================
136 # KMS + VPC
137 # ==================================================================
def _create_kms_key(self) -> None:
    """Create the customer-managed ``Analytics_KMS_Key`` and its service grants.

    The key has rotation enabled, a 7-day pending-deletion window, and
    ``RemovalPolicy.DESTROY``, so it does not outlive a stack teardown
    beyond that window. It is stored on ``self.kms_key`` for the helpers
    that encrypt with it (the Studio-only bucket and the Studio EFS in
    this file).

    One key-policy statement is added per service principal (CloudWatch
    Logs for this region, SageMaker, S3, EFS), each allowing the same
    encrypt/decrypt/data-key actions.
    """
    self.kms_key = kms.Key(
        self,
        "AnalyticsKmsKey",
        description="Analytics_KMS_Key - encrypts analytics S3 buckets, Studio EFS, SageMaker artifacts",
        enable_key_rotation=True,
        pending_window=Duration.days(7),
        removal_policy=RemovalPolicy.DESTROY,
    )

    crypto_actions = [
        "kms:Encrypt",
        "kms:Decrypt",
        "kms:ReEncrypt*",
        "kms:GenerateDataKey*",
        "kms:DescribeKey",
    ]
    service_hosts = (
        f"logs.{self.region}.amazonaws.com",
        "sagemaker.amazonaws.com",
        "s3.amazonaws.com",
        "elasticfilesystem.amazonaws.com",
    )
    for service_host in service_hosts:
        # The SID is derived from the first DNS label, e.g. ``AllowLogsEncrypt``.
        service_label = service_host.split(".")[0].capitalize()
        grant_statement = iam.PolicyStatement(
            sid=f"Allow{service_label}Encrypt",
            effect=iam.Effect.ALLOW,
            principals=[iam.ServicePrincipal(service_host)],
            actions=crypto_actions,
            # In a key policy, ``*`` always means the key itself.
            resources=["*"],
        )
        self.kms_key.add_to_resource_policy(grant_statement)
def _create_vpc_and_endpoints(self) -> None:
    """Create the analytics VPC, an S3 gateway endpoint, and nine interface endpoints.

    The VPC spans at most two AZs and has two subnet groups:
    ``AnalyticsPrivate`` (``PRIVATE_WITH_EGRESS``, /24) and
    ``AnalyticsPublic`` (``PUBLIC``, /28). A single NAT gateway provides
    outbound internet access for the private subnets (pip install, git
    clone, external APIs).

    Every interface endpoint is placed in the ``PRIVATE_WITH_EGRESS``
    subnets, so SageMaker, STS, CloudWatch Logs, ECR and EFS API traffic
    can stay on the private network. The result is stored on ``self.vpc``.
    """
    private_tier = ec2.SubnetConfiguration(
        name="AnalyticsPrivate",
        subnet_type=ec2.SubnetType.PRIVATE_WITH_EGRESS,
        cidr_mask=24,
    )
    public_tier = ec2.SubnetConfiguration(
        name="AnalyticsPublic",
        subnet_type=ec2.SubnetType.PUBLIC,
        cidr_mask=28,
    )
    self.vpc = ec2.Vpc(
        self,
        "AnalyticsVpc",
        max_azs=2,
        nat_gateways=1,
        subnet_configuration=[private_tier, public_tier],
    )

    # S3 uses a gateway endpoint; CDK attaches it to the route tables.
    self.vpc.add_gateway_endpoint(
        "S3GatewayEndpoint",
        service=ec2.GatewayVpcEndpointAwsService.S3,
    )

    # One interface endpoint per AWS service, created in this order. No
    # security group is passed, so each gets the one CDK creates by default.
    endpoint_specs: tuple[tuple[str, ec2.InterfaceVpcEndpointAwsService], ...] = (
        ("SagemakerApiEndpoint", ec2.InterfaceVpcEndpointAwsService.SAGEMAKER_API),
        ("SagemakerRuntimeEndpoint", ec2.InterfaceVpcEndpointAwsService.SAGEMAKER_RUNTIME),
        ("SagemakerStudioEndpoint", ec2.InterfaceVpcEndpointAwsService.SAGEMAKER_STUDIO),
        ("SagemakerNotebookEndpoint", ec2.InterfaceVpcEndpointAwsService.SAGEMAKER_NOTEBOOK),
        ("StsEndpoint", ec2.InterfaceVpcEndpointAwsService.STS),
        ("CloudWatchLogsEndpoint", ec2.InterfaceVpcEndpointAwsService.CLOUDWATCH_LOGS),
        ("EcrEndpoint", ec2.InterfaceVpcEndpointAwsService.ECR),
        ("EcrDockerEndpoint", ec2.InterfaceVpcEndpointAwsService.ECR_DOCKER),
        ("EfsEndpoint", ec2.InterfaceVpcEndpointAwsService.ELASTIC_FILESYSTEM),
    )
    for endpoint_id, aws_service in endpoint_specs:
        self.vpc.add_interface_endpoint(
            endpoint_id,
            service=aws_service,
            subnets=ec2.SubnetSelection(subnet_type=ec2.SubnetType.PRIVATE_WITH_EGRESS),
        )
237 # ==================================================================
238 # S3 buckets
239 # ==================================================================
def _create_access_logs_bucket(self) -> None:
    """Create the bucket that receives S3 server-access logs.

    The bucket uses S3-managed encryption (SSE-S3) rather than the
    analytics KMS key, which is the usual arrangement for a log sink.
    It blocks all public access, enforces SSL, is versioned, expires
    objects after 90 days, and is emptied and deleted with the stack.
    The result is stored on ``self.access_logs_bucket``.

    This bucket has no access-log target of its own; the corresponding
    cdk-nag finding is expected to be handled by the analytics
    suppression branch (not visible in this method).
    """
    expire_old_logs = s3.LifecycleRule(
        id="ExpireAccessLogs",
        enabled=True,
        expiration=Duration.days(90),
    )
    self.access_logs_bucket = s3.Bucket(
        self,
        "AnalyticsAccessLogsBucket",
        encryption=s3.BucketEncryption.S3_MANAGED,
        block_public_access=s3.BlockPublicAccess.BLOCK_ALL,
        enforce_ssl=True,
        versioned=True,
        removal_policy=RemovalPolicy.DESTROY,
        auto_delete_objects=True,
        lifecycle_rules=[expire_old_logs],
    )
def _create_studio_only_bucket(self) -> None:
    """Create ``Studio_Only_Bucket``, the KMS-encrypted notebook bucket.

    The physical name is ``gco-analytics-studio-<account>-<region>``.
    Objects are encrypted with ``self.kms_key`` (bucket keys enabled),
    public access is blocked, SSL is enforced, versioning is on, and
    server-access logs go to ``self.access_logs_bucket`` under the
    ``studio-only/`` prefix. The bucket is emptied and deleted with the
    stack. The result is stored on ``self.studio_only_bucket``.

    An explicit ``DenyInsecureTransport`` statement is added on top of
    ``enforce_ssl=True`` so the synthesized template carries a stable,
    assertable SID.
    """
    studio_bucket = s3.Bucket(
        self,
        "StudioOnlyBucket",
        bucket_name=f"gco-analytics-studio-{self.account}-{self.region}",
        encryption=s3.BucketEncryption.KMS,
        encryption_key=self.kms_key,
        bucket_key_enabled=True,
        block_public_access=s3.BlockPublicAccess.BLOCK_ALL,
        enforce_ssl=True,
        versioned=True,
        removal_policy=RemovalPolicy.DESTROY,
        auto_delete_objects=True,
        server_access_logs_bucket=self.access_logs_bucket,
        server_access_logs_prefix="studio-only/",
    )
    self.studio_only_bucket = studio_bucket

    # Deny every S3 action on the bucket and its objects when the request
    # did not arrive over TLS.
    bucket_arn = studio_bucket.bucket_arn
    deny_plaintext = iam.PolicyStatement(
        sid="DenyInsecureTransport",
        effect=iam.Effect.DENY,
        principals=[iam.AnyPrincipal()],
        actions=["s3:*"],
        resources=[bucket_arn, f"{bucket_arn}/*"],
        conditions={"Bool": {"aws:SecureTransport": "false"}},
    )
    studio_bucket.add_to_resource_policy(deny_plaintext)
313 # ==================================================================
314 # Studio EFS
315 # ==================================================================
def _create_studio_efs(self) -> None:
    """Create ``Studio_EFS`` and the security group that guards it.

    The file system is encrypted with ``self.kms_key``, has automatic
    backups enabled, lives in the ``PRIVATE_WITH_EGRESS`` subnets, and
    uses the removal policy parsed from
    ``analytics_environment.efs.removal_policy``. No access points are
    defined here.

    The security group has outbound traffic disabled and a single
    ingress rule: TCP/2049 (NFS) from the VPC's CIDR block. Results are
    stored on ``self.studio_efs_security_group`` and ``self.studio_efs``.
    """
    nfs_guard = ec2.SecurityGroup(
        self,
        "StudioEfsSecurityGroup",
        vpc=self.vpc,
        description="SG for Studio_EFS - allows NFS from the analytics VPC only",
        allow_all_outbound=False,
    )
    # The peer is the whole VPC CIDR block, not an individual subnet range.
    nfs_guard.add_ingress_rule(
        peer=ec2.Peer.ipv4(self.vpc.vpc_cidr_block),
        connection=ec2.Port.tcp(2049),
        description="NFS from analytics VPC private subnets",
    )
    self.studio_efs_security_group = nfs_guard

    private_subnets = ec2.SubnetSelection(subnet_type=ec2.SubnetType.PRIVATE_WITH_EGRESS)
    self.studio_efs = efs.FileSystem(
        self,
        "StudioEfs",
        vpc=self.vpc,
        vpc_subnets=private_subnets,
        encrypted=True,
        kms_key=self.kms_key,
        enable_automatic_backups=True,
        removal_policy=self.efs_removal,
        security_group=nfs_guard,
    )
354 # ==================================================================
355 # SageMaker execution role + grants
356 # ==================================================================
def _create_execution_role_and_grants(self) -> None:
    """Create ``SageMaker_Execution_Role`` and attach its (non-cluster-shared) grants.

    The role name is ``<SAGEMAKER_ROLE_NAME_PREFIX>-gco-analytics-exec-<region>``
    and the role is assumable by ``sagemaker.amazonaws.com``. The result
    is stored on ``self.sagemaker_execution_role``. Grants attached here,
    in order:

    * RW on ``Studio_Only_Bucket`` + encrypt/decrypt, ``kms:CreateGrant``
      and ``kms:DescribeKey`` on ``Analytics_KMS_Key``
    * ``execute-api:Invoke`` on the ``prod`` stage of any REST API in the
      api-gateway region, for every HTTP method, under ``/api/v1/*`` and
      ``/inference/*``
    * ``sqs:SendMessage`` on ``<project>-jobs-*`` queues in any region
    * ``ssm:GetParameter``/``ssm:GetParameters`` on the
      ``Cluster_Shared_Bucket`` metadata parameters in the global region
      — lets notebooks look up the bucket name/arn/region at runtime
    * EFS client mount/write/root-access actions scoped to the
      ``Studio_EFS`` file-system ARN, plus unscoped
      ``DescribeMountTargets``/``DescribeFileSystems``
    * Studio UI actions (domain, user-profile, space, app) in this
      stack's region
    * EMR Serverless application and job-run actions on ``*``
    * AWS-managed ``AmazonSageMakerFullAccess`` — always attached. It is
      paired with an inline ``sagemaker-mlflow:*`` statement for the
      MLflow data-plane namespace and ``sts:GetCallerIdentity``.
    * HyperPod training-job/cluster actions when ``hyperpod.enabled=true``
    * AWS-managed ``AmazonSageMakerCanvasFullAccess`` when
      ``canvas.enabled=true`` (opt-in no-code ML app)

    Finally, a resource policy statement is added to ``Studio_EFS``
    itself (see the note above that statement).

    The ``Cluster_Shared_Bucket`` grant lives in its own helper
    (:meth:`_grant_sagemaker_role_on_cluster_shared_bucket`) because the
    bucket ARN is resolved via a cross-region SSM read.
    """
    # NOTE(review): the ``description`` string below still says "read-only
    # GCO API access", but the execute-api statement further down covers
    # every HTTP method. The string is left untouched here because it is
    # part of the synthesized template.
    self.sagemaker_execution_role = iam.Role(
        self,
        "SagemakerExecutionRole",
        role_name=f"{SAGEMAKER_ROLE_NAME_PREFIX}-gco-analytics-exec-{self.region}",
        assumed_by=iam.ServicePrincipal("sagemaker.amazonaws.com"),
        description=(
            "SageMaker_Execution_Role - assumed by notebooks in the Studio "
            "domain. Grants RW on Studio_Only_Bucket and (via a separate "
            "cross-region policy) Cluster_Shared_Bucket, plus read-only GCO "
            "API access, SQS job submission, and cross-region ssm:GetParameter "
            "on the Cluster_Shared_Bucket metadata parameters."
        ),
    )

    # Bucket + KMS grants — studio-only scratch space. Analytics_KMS_Key
    # already has encrypt/decrypt in its key policy for the sagemaker
    # service principal, but role-level grants are still required for
    # IAM-side authorization.
    self.studio_only_bucket.grant_read_write(self.sagemaker_execution_role)
    self.kms_key.grant_encrypt_decrypt(self.sagemaker_execution_role)

    # SageMaker needs CreateGrant on the KMS key to delegate encryption
    # to EBS when creating space volumes. The grant is scoped to this key.
    # NOTE(review): no condition is passed here, so the grant is NOT
    # restricted to AWS-service grantees (e.g. via
    # ``kms:GrantIsForAWSResource``) — confirm whether that restriction
    # was intended.
    self.kms_key.grant(
        self.sagemaker_execution_role,
        "kms:CreateGrant",
        "kms:DescribeKey",
    )

    # GCO API scope — notebooks need both read-only GET operations
    # (list jobs, describe endpoints, fetch health) and job/inference
    # submission actions (POST manifests, PUT template updates, DELETE
    # jobs). Grant the full ``/api/v1/*`` method surface instead of
    # GET-only so users can submit new jobs, manage templates, and
    # tear things down from inside a notebook without bouncing
    # through a service account.
    #
    # The exact API id is not known here (it lives in the api-gateway
    # stack). Scope to the api-gateway region with any REST API id for
    # now.
    api_gw_region = self.config.get_api_gateway_region()
    self.sagemaker_execution_role.add_to_policy(
        iam.PolicyStatement(
            effect=iam.Effect.ALLOW,
            actions=["execute-api:Invoke"],
            resources=[
                # ``*/prod/*/api/v1/*`` — any API id, the ``prod`` stage,
                # any HTTP method, any path below /api/v1/. Paths under
                # /studio/* match neither pattern in this list, so they
                # are not granted (there is no explicit Deny).
                f"arn:aws:execute-api:{api_gw_region}:{self.account}:*/prod/*/api/v1/*",
                # ``/inference/*`` — same stage and method wildcarding as
                # above, for the inference routes.
                f"arn:aws:execute-api:{api_gw_region}:{self.account}:*/prod/*/inference/*",
            ],
        )
    )

    # SQS job submission — scoped to the queue name pattern
    # ``<project>-jobs-*``. The queue regions are not known here, so the
    # region component of the ARN is ``*`` with the project name fixed.
    project_name = self.config.get_project_name()
    self.sagemaker_execution_role.add_to_policy(
        iam.PolicyStatement(
            effect=iam.Effect.ALLOW,
            actions=["sqs:SendMessage"],
            resources=[
                f"arn:aws:sqs:*:{self.account}:{project_name}-jobs-*",
            ],
        )
    )

    # ssm:GetParameter on the Cluster_Shared_Bucket metadata params. The
    # parameters live in the global region, not necessarily in the
    # analytics region, so the ARN uses ``get_global_region()``. Scoping
    # to the CLUSTER_SHARED_SSM_PARAMETER_PREFIX tree means a notebook can
    # fetch the bucket metadata at runtime with a plain
    # ``boto3.client('ssm', region_name=<global-region>).get_parameter(...)``
    # call, without any manual export step.
    global_region = self.config.get_global_region()
    self.sagemaker_execution_role.add_to_policy(
        iam.PolicyStatement(
            effect=iam.Effect.ALLOW,
            actions=["ssm:GetParameter", "ssm:GetParameters"],
            resources=[
                f"arn:aws:ssm:{global_region}:{self.account}:parameter{CLUSTER_SHARED_SSM_PARAMETER_PREFIX}/*",
            ],
        )
    )

    # EFS mount actions — scoped to the Studio EFS file-system ARN.
    self.sagemaker_execution_role.add_to_policy(
        iam.PolicyStatement(
            effect=iam.Effect.ALLOW,
            actions=[
                "elasticfilesystem:ClientMount",
                "elasticfilesystem:ClientWrite",
                "elasticfilesystem:ClientRootAccess",
            ],
            resources=[self.studio_efs.file_system_arn],
        )
    )

    # The EFS Describe* calls are granted on ``*``. SageMaker calls
    # DescribeMountTargets during user profile provisioning to validate
    # the EFS mount configuration.
    self.sagemaker_execution_role.add_to_policy(
        iam.PolicyStatement(
            effect=iam.Effect.ALLOW,
            actions=[
                "elasticfilesystem:DescribeMountTargets",
                "elasticfilesystem:DescribeFileSystems",
            ],
            resources=["*"],
        )
    )

    # SageMaker Studio UI actions — the execution role is assumed by
    # the Studio notebook runtime and needs these to render the IDE,
    # list spaces/apps, and manage its own lifecycle. Scoped to this
    # stack's region and account.
    self.sagemaker_execution_role.add_to_policy(
        iam.PolicyStatement(
            effect=iam.Effect.ALLOW,
            actions=[
                "sagemaker:DescribeDomain",
                "sagemaker:DescribeUserProfile",
                "sagemaker:CreatePresignedDomainUrl",
                "sagemaker:ListSpaces",
                "sagemaker:ListApps",
                "sagemaker:DescribeApp",
                "sagemaker:DescribeSpace",
                "sagemaker:CreateApp",
                "sagemaker:DeleteApp",
                "sagemaker:CreateSpace",
                "sagemaker:DeleteSpace",
                "sagemaker:UpdateSpace",
                "sagemaker:ListTags",
                "sagemaker:AddTags",
            ],
            resources=[
                f"arn:aws:sagemaker:{self.region}:{self.account}:domain/*",
                f"arn:aws:sagemaker:{self.region}:{self.account}:user-profile/*/*",
                f"arn:aws:sagemaker:{self.region}:{self.account}:space/*/*",
                f"arn:aws:sagemaker:{self.region}:{self.account}:app/*/*/*/*",
            ],
        )
    )

    # EMR Serverless — allow the execution role to discover, connect to,
    # and manage EMR Serverless applications and job runs from Studio.
    # Granted on ``*`` (not limited to the application this stack creates).
    self.sagemaker_execution_role.add_to_policy(
        iam.PolicyStatement(
            effect=iam.Effect.ALLOW,
            actions=[
                "emr-serverless:ListApplications",
                "emr-serverless:GetApplication",
                "emr-serverless:CreateApplication",
                "emr-serverless:StartApplication",
                "emr-serverless:StopApplication",
                "emr-serverless:StartJobRun",
                "emr-serverless:GetJobRun",
                "emr-serverless:ListJobRuns",
                "emr-serverless:CancelJobRun",
                "emr-serverless:GetDashboardForJobRun",
                "emr-serverless:AccessLivyEndpoints",
            ],
            resources=["*"],
        )
    )

    # SageMaker-managed MLflow + Model Registry + MLflow Apps.
    #
    # We attach the AWS-managed ``AmazonSageMakerFullAccess`` policy
    # for two reasons:
    #
    # 1. MLflow Apps (the newer Studio panel, separate from MLflow
    #    Tracking Servers) requires ``sagemaker:CreateMlflowApp``/
    #    ``ListMlflowApps``/``DescribeMlflowApp`` etc. The action
    #    surface is evolving quickly and the managed policy tracks
    #    it. Enumerating it inline would drift.
    # 2. SageMaker Model Registry (``sagemaker:*ModelPackage*``),
    #    Studio space/app lifecycle, training-job submission, and
    #    the "related-services" helpers (S3, ECR, CloudWatch Logs)
    #    are already covered by the managed policy — keeping them
    #    inline duplicated the managed policy and kept us in a
    #    catch-up loop whenever SageMaker shipped a new feature.
    #
    # The managed policy is ``Resource: *`` by design; the trade-off
    # (broader-than-least-privilege inside the role) is
    # acknowledged with a nag suppression below. The inline
    # ``sagemaker-mlflow:*`` statement that follows is still
    # required because the managed policy does NOT cover the
    # ``sagemaker-mlflow`` data-plane namespace — that's what the
    # MLflow SDK talks to over SigV4 for ``log_metric``,
    # ``log_artifact``, ``register_model``, etc.
    #
    # The helper is imported locally, inside this method, rather than at
    # module top level.
    from gco.stacks.nag_suppressions import suppress_managed_policy_opt_in

    self.sagemaker_execution_role.add_managed_policy(
        iam.ManagedPolicy.from_aws_managed_policy_name("AmazonSageMakerFullAccess")
    )
    suppress_managed_policy_opt_in(
        self.sagemaker_execution_role,
        managed_policy_name="AmazonSageMakerFullAccess",
        reason=(
            "AmazonSageMakerFullAccess is attached to "
            "SageMaker_Execution_Role when analytics_environment.enabled=true. "
            "The managed policy covers MLflow Apps, MLflow Tracking "
            "Servers, SageMaker Model Registry, Studio space/app "
            "lifecycle, training-job submission, and the cross-service "
            "helpers (S3, ECR, CloudWatch Logs) SageMaker needs to "
            "render the IDE and run jobs. Enumerating this surface "
            "inline drifts out of date within weeks — tracking the "
            "AWS-managed policy is the supported path. The inline "
            "``sagemaker-mlflow:*`` statement that follows covers "
            "the data-plane namespace the managed policy does not "
            "include. Users who want a locked-down alternative can "
            "disable the analytics environment."
        ),
    )

    # MLflow SDK data-plane (``sagemaker-mlflow:*``) — required for
    # ``mlflow.log_metric``, ``mlflow.log_artifact``,
    # ``mlflow.register_model``, etc. to round-trip through the
    # SageMaker-managed tracking server over SigV4. The managed
    # policy above covers the ``sagemaker:*`` control-plane
    # namespace but not ``sagemaker-mlflow:*`` (a separate service
    # prefix), so we keep this inline. The ARNs use ``api_gw_region``,
    # whereas the Studio UI statement above uses ``self.region``; the
    # class docstring says this stack lives in the API gateway region,
    # in which case the two are the same.
    self.sagemaker_execution_role.add_to_policy(
        iam.PolicyStatement(
            effect=iam.Effect.ALLOW,
            actions=["sagemaker-mlflow:*"],
            resources=[
                f"arn:aws:sagemaker:{api_gw_region}:{self.account}:mlflow-tracking-server/*",
                f"arn:aws:sagemaker:{api_gw_region}:{self.account}:mlflow-app/*",
            ],
        )
    )

    # MLflow's SigV4 plug-in calls STS ``GetCallerIdentity``.
    # ``sts:GetCallerIdentity`` does not support resource-level
    # scoping, so Resource: * is the only valid value.
    self.sagemaker_execution_role.add_to_policy(
        iam.PolicyStatement(
            effect=iam.Effect.ALLOW,
            actions=["sts:GetCallerIdentity"],
            resources=["*"],
        )
    )

    # HyperPod sub-toggle — additional SageMaker actions for training-job
    # submission and cluster-node inspection, granted on ``*``.
    # NOTE(review): ``sagemaker:ClusterInstance`` and
    # ``sagemaker:ClusterInstanceGroup`` do not look like IAM action
    # names (they read like resource/type names) — verify against the
    # SageMaker service authorization reference; the intended actions may
    # be the ``*Cluster*`` API operations.
    if self.hyperpod_enabled:
        self.sagemaker_execution_role.add_to_policy(
            iam.PolicyStatement(
                effect=iam.Effect.ALLOW,
                actions=[
                    "sagemaker:CreateTrainingJob",
                    "sagemaker:DescribeTrainingJob",
                    "sagemaker:StopTrainingJob",
                    "sagemaker:ClusterInstance",
                    "sagemaker:ClusterInstanceGroup",
                    "sagemaker:DescribeClusterNode",
                    "sagemaker:ListClusterNodes",
                ],
                resources=["*"],
            )
        )

    # Canvas sub-toggle — attach AWS-managed ``AmazonSageMakerCanvasFullAccess``
    # to the execution role so users can launch the Canvas no-code ML
    # app from inside Studio. The managed policy is used deliberately
    # (rather than enumerating each action) because Canvas's per-feature
    # permission surface — Bedrock for generative AI, Forecast for time
    # series, Rekognition for image classification, S3 writes for
    # datasets, Athena for SQL sources, etc. — is large and evolves with
    # every Canvas release. Tracking AWS's managed policy means we pick
    # up new Canvas capabilities automatically without shipping a CDK
    # change. The trade-off (broader-than-least-privilege inside the
    # role) is acknowledged with a dedicated nag suppression below.
    #
    # The matching ``CanvasAppSettings`` override on the Studio domain
    # is expected to live in ``_create_studio_domain`` (not visible from
    # this method).
    if self.canvas_enabled:
        self.sagemaker_execution_role.add_managed_policy(
            iam.ManagedPolicy.from_aws_managed_policy_name("AmazonSageMakerCanvasFullAccess")
        )

        suppress_managed_policy_opt_in(
            self.sagemaker_execution_role,
            managed_policy_name="AmazonSageMakerCanvasFullAccess",
            reason=(
                "AmazonSageMakerCanvasFullAccess is attached to "
                "SageMaker_Execution_Role when analytics_environment.canvas.enabled=true. "
                "Canvas is an opt-in sub-toggle (off by default) and its managed "
                "policy is preferred over an enumerated least-privilege policy "
                "because Canvas's cross-service permission surface (Bedrock, "
                "Forecast, Rekognition, Athena, S3 dataset writes, etc.) evolves "
                "with every Canvas release — tracking the managed policy keeps "
                "Canvas functional as AWS ships new features. Users who want a "
                "locked-down alternative can keep the toggle off."
            ),
        )

    # EFS resource policy — includes DescribeMountTargets without any
    # condition because SageMaker calls it during user-profile
    # provisioning. The statement is permissive by design so the
    # resource policy does not block control-plane calls; access is
    # expected to be limited by the EFS security group (NFS only from
    # inside the VPC) and by each caller's IAM policies.
    #
    # NOTE(review): ``iam.AnyPrincipal()`` is not limited to this
    # account, and the statement also allows the Delete* actions below.
    # Confirm whether an account-scoped principal or an
    # ``aws:PrincipalAccount`` condition was intended.
    # NOTE(review): an earlier version of this comment said
    # DescribeAccessPoints/DescribeFileSystems cannot appear in an EFS
    # resource policy, yet ``elasticfilesystem:DescribeFileSystems`` is
    # listed below — verify which is correct against the EFS
    # documentation.
    self.studio_efs.add_to_resource_policy(
        iam.PolicyStatement(
            effect=iam.Effect.ALLOW,
            principals=[iam.AnyPrincipal()],
            actions=[
                "elasticfilesystem:ClientMount",
                "elasticfilesystem:ClientWrite",
                "elasticfilesystem:ClientRootAccess",
                "elasticfilesystem:DescribeMountTargets",
                "elasticfilesystem:DescribeFileSystems",
                "elasticfilesystem:DeleteAccessPoint",
                "elasticfilesystem:DeleteMountTarget",
                "elasticfilesystem:DeleteFileSystem",
                "elasticfilesystem:DeleteFileSystemPolicy",
            ],
        )
    )
def _grant_sagemaker_role_on_cluster_shared_bucket(self) -> None:
    """Give ``SageMaker_Execution_Role`` RW + KMS access to ``Cluster_Shared_Bucket``.

    The bucket ARN is not available locally. It is read from the SSM
    parameter ``<CLUSTER_SHARED_SSM_PARAMETER_PREFIX>/arn`` in the global
    region by an ``AwsCustomResource`` that issues ``ssm:GetParameter``;
    the grant then uses the custom resource's ``Parameter.Value``
    response field as the ARN. The original notes describe this as the
    same pattern as
    ``GCORegionalStack._resolve_cluster_shared_bucket_from_ssm``.

    A single ``iam.Policy`` attached to the role carries two statements:

    1. S3: ``GetObject``/``PutObject``/``DeleteObject``/``ListBucket``/
       ``GetBucketLocation`` on ``<arn>`` and ``<arn>/*``.
    2. KMS: ``Decrypt``/``GenerateDataKey`` on ``*``, conditioned on
       ``kms:ViaService=s3.<global-region>.amazonaws.com``.

    Only the role side is modified here; this method creates no bucket
    policy (per the original notes, that is owned by ``GCOGlobalStack``).
    """
    from cdk_nag import NagSuppressions

    global_region = self.config.get_global_region()
    parameter_name = f"{CLUSTER_SHARED_SSM_PARAMETER_PREFIX}/arn"

    def ssm_lookup_call() -> cr.AwsSdkCall:
        # Create and update issue the identical read, so both handlers
        # are produced from this one definition.
        return cr.AwsSdkCall(
            service="SSM",
            action="getParameter",
            parameters={"Name": parameter_name},
            region=global_region,
            physical_resource_id=cr.PhysicalResourceId.of("analytics-cluster-shared-arn"),
        )

    read_cr = cr.AwsCustomResource(
        self,
        "ReadClusterSharedBucketArn",
        on_create=ssm_lookup_call(),
        on_update=ssm_lookup_call(),
        policy=cr.AwsCustomResourcePolicy.from_sdk_calls(
            resources=cr.AwsCustomResourcePolicy.ANY_RESOURCE
        ),
    )

    # The custom resource's policy is ``Resource::*`` (ANY_RESOURCE
    # above), which trips AwsSolutions-IAM5. The suppression is limited
    # to that one finding on this construct and its children.
    lookup_wildcard_suppression = {
        "id": "AwsSolutions-IAM5",
        "reason": (
            "Cross-region ssm:GetParameter for "
            f"{parameter_name} in the global region. The "
            "AwsCustomResource SDK-call policy is scoped to a "
            "single fixed action (ssm:GetParameter) with a "
            "fixed parameter Name — the Resource: * is the "
            "CDK-documented escape hatch because the parameter "
            "ARN is not known to the calling principal's "
            "region. Effective blast radius: one parameter."
        ),
        "appliesTo": ["Resource::*"],
    }
    NagSuppressions.add_resource_suppressions(
        read_cr,
        [lookup_wildcard_suppression],
        apply_to_children=True,
    )

    shared_arn = read_cr.get_response_field("Parameter.Value")

    # Bucket-level actions need the bare ARN, object-level actions the
    # ``/*`` form, so both resources are listed.
    s3_rw_statement = iam.PolicyStatement(
        effect=iam.Effect.ALLOW,
        actions=[
            "s3:GetObject",
            "s3:PutObject",
            "s3:DeleteObject",
            "s3:ListBucket",
            "s3:GetBucketLocation",
        ],
        resources=[shared_arn, f"{shared_arn}/*"],
    )
    # No key ARN is referenced; the statement is narrowed by the
    # ``kms:ViaService`` condition on S3 in the global region instead.
    kms_via_s3_statement = iam.PolicyStatement(
        effect=iam.Effect.ALLOW,
        actions=["kms:Decrypt", "kms:GenerateDataKey"],
        resources=["*"],
        conditions={
            "StringEquals": {
                "kms:ViaService": f"s3.{global_region}.amazonaws.com",
            }
        },
    )

    # Role-side grant: a standalone Policy construct attached to the
    # execution role (nothing is added to the bucket).
    iam.Policy(
        self,
        "SagemakerClusterSharedBucketGrant",
        roles=[self.sagemaker_execution_role],
        statements=[s3_rw_statement, kms_via_s3_statement],
    )

    # The ``<arn>/*`` object-key wildcard in the S3 statement is the
    # IAM5 finding suppressed here; the regex matches the wildcard built
    # on the ARN token coming from ``ReadClusterSharedBucketArn``.
    object_wildcard_suppression = {
        "id": "AwsSolutions-IAM5",
        "reason": (
            "The SageMaker RW grant on Cluster_Shared_Bucket "
            "uses an <arn>/* object-key wildcard on the literal "
            "ARN resolved from SSM. The wildcard covers object "
            "keys within the single always-on "
            "gco-cluster-shared-<account>-<region> bucket."
        ),
        "appliesTo": [
            {"regex": (r"/^Resource::<ReadClusterSharedBucketArn.*>\/\*$/")},
        ],
    }
    NagSuppressions.add_resource_suppressions(
        self.sagemaker_execution_role,
        [object_wildcard_suppression],
        apply_to_children=True,
    )
870 # ==================================================================
871 # SageMaker Studio domain
872 # ==================================================================
def _create_studio_domain(self) -> None:
    """Create the SageMaker Studio domain and its teardown custom resource.

    The domain uses ``auth_mode=IAM`` and
    ``app_network_access_type=VpcOnly`` and is placed on the VPC's
    ``PRIVATE_WITH_EGRESS`` subnets. ``DefaultUserSettings`` carries:

    * ``ExecutionRole`` — ``self.sagemaker_execution_role``.
    * ``CustomFileSystemConfigs`` — ``self.studio_efs`` mounted at
      ``/home/sagemaker-user``. Per the original design notes, per-user
      ``/home/<username>`` isolation comes from access points that the
      presigned-URL Lambda creates lazily on first login.
    * ``SecurityGroups`` — a dedicated all-outbound security group.

    ``jupyter_lab_app_settings`` (and therefore ``CustomImages``) is
    intentionally left unset.

    Side effects: sets ``self.studio_compute_sg``, ``self.studio_domain``
    and (via :meth:`_create_studio_cleanup_resource`)
    ``self._cleanup_resource``; emits the ``StudioDomainName`` output.
    """
    private_subnets = self.vpc.select_subnets(
        subnet_type=ec2.SubnetType.PRIVATE_WITH_EGRESS
    ).subnets

    efs_fs_config = sagemaker.CfnDomain.EFSFileSystemConfigProperty(
        file_system_id=self.studio_efs.file_system_id,
        file_system_path="/home/sagemaker-user",
    )
    efs_custom_fs = sagemaker.CfnDomain.CustomFileSystemConfigProperty(
        efs_file_system_config=efs_fs_config,
    )

    # We also considered adding an ``S3FileSystemConfig`` custom file
    # system that would mount the always-on ``Cluster_Shared_Bucket``
    # under ``/mount/cluster-shared``. aws-cdk-lib exposes the property
    # and CloudFormation synths it cleanly, but the SageMaker Studio
    # service rejects the resource at create time with ``Invalid request
    # provided: S3FileSystemConfig for SageMaker AI Studio is not
    # supported yet.`` — so we ship without the mount. Notebooks access
    # the cluster-shared bucket via ``boto3`` (the SageMaker execution
    # role's cross-region RW grant in
    # :meth:`_grant_sagemaker_role_on_cluster_shared_bucket` already
    # authorizes that path). Revisit this block when SageMaker Studio
    # lights up S3 custom file systems.
    custom_file_systems: list[sagemaker.CfnDomain.CustomFileSystemConfigProperty] = [
        efs_custom_fs,
    ]

    # Security group for Studio compute — allows all outbound so
    # notebooks can reach the internet (pip, git, etc.) via the NAT
    # gateway. SageMaker's default VpcOnly security group only permits
    # NFS traffic, which blocks all internet access from notebooks.
    self.studio_compute_sg = ec2.SecurityGroup(
        self,
        "StudioComputeSg",
        vpc=self.vpc,
        description="Allows outbound internet access from Studio notebooks",
        allow_all_outbound=True,
    )

    default_user_settings = sagemaker.CfnDomain.UserSettingsProperty(
        execution_role=self.sagemaker_execution_role.role_arn,
        custom_file_system_configs=custom_file_systems,
        security_groups=[self.studio_compute_sg.security_group_id],
        # ``jupyter_lab_app_settings`` is deliberately omitted so
        # ``CustomImages`` stays absent — the template contains no
        # SageMaker image resources and no CustomImages key on the
        # domain.
    )

    self.studio_domain = sagemaker.CfnDomain(
        self,
        "StudioDomain",
        auth_mode="IAM",
        app_network_access_type="VpcOnly",
        domain_name=f"gco-studio-{self.region}",
        subnet_ids=[s.subnet_id for s in private_subnets],
        vpc_id=self.vpc.vpc_id,
        kms_key_id=self.kms_key.key_id,
        default_user_settings=default_user_settings,
    )

    # Canvas sub-toggle (UI side): **IAM-only**. The
    # ``AmazonSageMakerCanvasFullAccess`` managed policy attached to the
    # SageMaker execution role in
    # :meth:`_create_execution_role_and_grants` is sufficient to surface
    # the Canvas tile on the Studio landing page — when a user with that
    # policy opens Studio, SageMaker auto-discovers the entitlement and
    # lights up the Canvas launcher.
    #
    # We intentionally do *not* inject a
    # ``DefaultUserSettings.CanvasAppSettings`` block on the domain. The
    # CloudFormation ``AWS::SageMaker::Domain`` resource does not accept
    # that property (only ``AWS::SageMaker::UserProfile`` does), so a
    # property override fails early validation with ``Unsupported
    # property [CanvasAppSettings]``. Canvas uses its own default
    # workspace artifact locations; operators who want to pin per-user
    # Canvas defaults can apply ``CanvasAppSettings`` at the
    # ``UserProfile`` level directly.

    # The domain validates that the EFS file system has mount targets in
    # every subnet before stabilizing. CDK doesn't infer this dependency
    # from the file_system_id reference alone, so we add it explicitly.
    self.studio_domain.node.add_dependency(self.studio_efs)

    CfnOutput(
        self,
        "StudioDomainName",
        # ``domain_name`` is typed optional on the L1 construct; fall
        # back to an empty string so the output value is always a str.
        value=self.studio_domain.domain_name or "",
        description="Name of the SageMaker Studio domain",
    )

    self._create_studio_cleanup_resource()


def _create_studio_cleanup_resource(self) -> None:
    """Create the custom resource that empties the domain/EFS on stack deletion.

    Private helper of :meth:`_create_studio_domain`; it must run after
    ``self.studio_domain`` exists because the Lambda environment reads
    ``attr_domain_id``. It creates, in order:

    * ``CleanupFunction`` — Lambda built from ``lambda/analytics-cleanup``
      with ``DOMAIN_ID``/``EFS_ID``/``REGION``/``VPC_ID`` in its
      environment. Per the original design notes it removes user
      profiles and EFS access points so CloudFormation can delete the
      domain and file system cleanly.
    * ``CleanupFunctionPolicy`` — customer-managed policy attached to the
      function's role.
    * ``CleanupProvider`` / ``DomainCleanup`` — the provider framework
      and the custom resource backed by it.

    Side effects: stores the custom resource on ``self._cleanup_resource``
    so :meth:`_create_presigned_url_lambda` can add a dependency on it,
    and registers cdk-nag suppressions on the function role and provider.
    """
    from aws_cdk import CustomResource
    from cdk_nag import NagSuppressions

    cleanup_fn = lambda_.Function(
        self,
        "CleanupFunction",
        runtime=getattr(lambda_.Runtime, LAMBDA_PYTHON_RUNTIME),
        handler="handler.handler",
        code=lambda_.Code.from_asset("lambda/analytics-cleanup"),
        # 15 minutes covers the worst case of multiple async drain loops
        # in series: apps (up to ~2 min), spaces (up to ~3 min), user
        # profiles (up to ~3 min), SageMaker-managed EFS mount targets
        # (up to ~2 min), plus incidental RPC latency and security-group
        # cleanup. In the common case (a handful of users) this finishes
        # in well under a minute.
        timeout=Duration.minutes(15),
        environment={
            "DOMAIN_ID": self.studio_domain.attr_domain_id,
            "EFS_ID": self.studio_efs.file_system_id,
            "REGION": self.region,
            "VPC_ID": self.vpc.vpc_id,
        },
    )
    # ``role`` is typed optional; narrow it once and reuse the local.
    cleanup_role = cleanup_fn.role
    assert cleanup_role is not None  # always set for non-imported functions

    # Use a customer-managed policy instead of an inline policy. Inline
    # policies (created by add_to_role_policy) are separate
    # CloudFormation resources that can be deleted before the custom
    # resource fires during stack deletion. A managed policy attached
    # via the role's managedPolicies property is part of the role
    # resource itself and persists until the role is deleted.
    cleanup_policy = iam.ManagedPolicy(
        self,
        "CleanupFunctionPolicy",
        statements=[
            iam.PolicyStatement(
                effect=iam.Effect.ALLOW,
                actions=[
                    "sagemaker:ListApps",
                    "sagemaker:DeleteApp",
                    "sagemaker:ListSpaces",
                    "sagemaker:DeleteSpace",
                    "sagemaker:ListUserProfiles",
                    "sagemaker:DeleteUserProfile",
                    "sagemaker:DescribeDomain",
                    "elasticfilesystem:DescribeAccessPoints",
                    "elasticfilesystem:DeleteAccessPoint",
                    "elasticfilesystem:DescribeFileSystems",
                    "elasticfilesystem:DescribeMountTargets",
                    "elasticfilesystem:DeleteMountTarget",
                    "elasticfilesystem:DeleteFileSystem",
                    "elasticfilesystem:DeleteFileSystemPolicy",
                    "ec2:DescribeSecurityGroups",
                    "ec2:DeleteSecurityGroup",
                    "ec2:RevokeSecurityGroupIngress",
                    "ec2:RevokeSecurityGroupEgress",
                ],
                resources=["*"],
            )
        ],
    )
    cleanup_role.add_managed_policy(cleanup_policy)

    # ``cr`` is the module-level alias for ``aws_cdk.custom_resources``.
    cleanup_provider = cr.Provider(
        self,
        "CleanupProvider",
        on_event_handler=cleanup_fn,
    )

    cleanup_resource = CustomResource(
        self,
        "DomainCleanup",
        service_token=cleanup_provider.service_token,
    )

    # The managed policy must not be deleted until after the cleanup
    # custom resource completes. Adding a dependency ensures
    # CloudFormation keeps the policy alive during the Lambda execution.
    cleanup_resource.node.add_dependency(cleanup_policy)

    # Store reference so _create_presigned_url_lambda can add a
    # dependency after it creates the presigned-URL Lambda.
    self._cleanup_resource = cleanup_resource

    # Nag suppression for the cleanup Lambda — the policy above uses
    # Resource::*; the domain ID and EFS ID are passed via env vars,
    # not ARNs.
    NagSuppressions.add_resource_suppressions(
        cleanup_role,
        [
            {
                "id": "AwsSolutions-IAM5",
                "reason": (
                    "Cleanup Lambda needs Resource::* for "
                    "sagemaker:ListUserProfiles/DeleteUserProfile and "
                    "efs:DescribeAccessPoints/DeleteAccessPoint. These "
                    "APIs don't support resource-level scoping. The "
                    "Lambda only runs on stack deletion and is scoped "
                    "to the domain ID and EFS ID via environment variables."
                ),
                "appliesTo": ["Resource::*"],
            },
            {
                "id": "AwsSolutions-IAM4",
                "reason": (
                    "Cleanup Lambda uses AWSLambdaBasicExecutionRole "
                    "managed policy for CloudWatch Logs access."
                ),
                "appliesTo": [
                    "Policy::arn:<AWS::Partition>:iam::aws:policy/service-role/AWSLambdaBasicExecutionRole",
                ],
            },
        ],
        apply_to_children=True,
    )
    NagSuppressions.add_resource_suppressions(
        cleanup_provider,
        [
            {
                "id": "AwsSolutions-IAM5",
                "reason": (
                    "CDK Provider framework uses Resource::* for its "
                    "internal Lambda invocation policy."
                ),
                "appliesTo": [
                    "Resource::*",
                    {"regex": "/^Resource::<CleanupFunction.*\\.Arn>:\\*$/"},
                ],
            },
            {
                "id": "AwsSolutions-IAM4",
                "reason": ("CDK Provider framework uses AWSLambdaBasicExecutionRole."),
                "appliesTo": [
                    "Policy::arn:<AWS::Partition>:iam::aws:policy/service-role/AWSLambdaBasicExecutionRole",
                ],
            },
            {
                "id": "AwsSolutions-L1",
                "reason": ("CDK Provider framework manages its own Lambda runtime version."),
            },
        ],
        apply_to_children=True,
    )
1133 # ==================================================================
1134 # EMR Serverless application
1135 # ==================================================================
def _create_emr_app(self) -> None:
    """Provision the EMR Serverless Spark application inside the private VPC.

    The release label is taken from ``EMR_SERVERLESS_RELEASE_LABEL`` so
    the Spark runtime is pinned rather than floating. The application's
    network configuration targets the VPC's ``PRIVATE_WITH_EGRESS``
    subnets and a dedicated all-outbound security group.

    Side effects: sets ``self.emr_security_group`` and ``self.emr_app``.
    """
    egress_subnets = self.vpc.select_subnets(
        subnet_type=ec2.SubnetType.PRIVATE_WITH_EGRESS
    ).subnets

    worker_sg = ec2.SecurityGroup(
        self,
        "EmrServerlessSecurityGroup",
        vpc=self.vpc,
        description="SG for EMR Serverless Spark workers",
        allow_all_outbound=True,
    )
    self.emr_security_group = worker_sg

    # Workers are attached to the private subnets and the dedicated
    # security group created above.
    worker_network = emrserverless.CfnApplication.NetworkConfigurationProperty(
        subnet_ids=[subnet.subnet_id for subnet in egress_subnets],
        security_group_ids=[worker_sg.security_group_id],
    )

    self.emr_app = emrserverless.CfnApplication(
        self,
        "EmrServerlessApp",
        name=f"gco-spark-{self.region}",
        release_label=EMR_SERVERLESS_RELEASE_LABEL,
        type="SPARK",
        network_configuration=worker_network,
    )
1172 # ==================================================================
1173 # Cognito pool + client + domain
1174 # ==================================================================
def _create_cognito_pool(self) -> None:
    """Create the Cognito user pool, app client and hosted domain for Studio logins.

    * Pool: self sign-up disabled, username sign-in, e-mail
      auto-verification, a 12-character password policy requiring
      digits, symbols, upper- and lower-case, standard threat protection
      set to ``NO_ENFORCEMENT``, and ``self.cognito_removal`` as the
      removal policy.
    * Client: ``user_srp`` and ``admin_user_password`` auth flows, with
      user-existence errors suppressed and token revocation enabled.
    * Domain: prefix taken from ``self._cognito_domain_prefix_override``
      when it is truthy, otherwise
      ``<COGNITO_DOMAIN_PREFIX_DEFAULT>-<account>``.

    Side effects: sets ``self.cognito_pool``, ``self.cognito_client`` and
    ``self.cognito_domain`` and emits three ``CfnOutput`` values (pool id,
    pool ARN, client id).
    """
    strong_passwords = cognito.PasswordPolicy(
        min_length=12,
        require_digits=True,
        require_symbols=True,
        require_uppercase=True,
        require_lowercase=True,
    )

    # ``standard_threat_protection_mode`` replaces the deprecated
    # ``advanced_security_mode`` kwarg (aws-cdk-lib's
    # AdvancedSecurityMode enum is gone as of the Cognito November 2024
    # tier changes). The Lite feature plan — the default — does not
    # support real threat protection, so ``NO_ENFORCEMENT`` keeps the
    # synth warning-free.
    # TODO: operators who want real threat protection should opt into
    # the Essentials or Plus feature plan by also setting
    # ``feature_plan=cognito.FeaturePlan.ESSENTIALS`` (or
    # ``FeaturePlan.PLUS``) and flipping this to
    # ``StandardThreatProtectionMode.FULL_FUNCTION``. That path changes
    # the per-MAU price — see the Cognito pricing doc — which is why the
    # default stays on Lite+NO_ENFORCEMENT.
    threat_mode = cognito.StandardThreatProtectionMode.NO_ENFORCEMENT

    self.cognito_pool = cognito.UserPool(
        self,
        "StudioUserPool",
        self_sign_up_enabled=False,
        password_policy=strong_passwords,
        sign_in_aliases=cognito.SignInAliases(username=True),
        auto_verify=cognito.AutoVerifiedAttrs(email=True),
        standard_threat_protection_mode=threat_mode,
        removal_policy=self.cognito_removal,
    )

    login_flows = cognito.AuthFlow(
        user_srp=True,
        admin_user_password=True,
    )
    self.cognito_client = self.cognito_pool.add_client(
        "StudioUserPoolClient",
        auth_flows=login_flows,
        prevent_user_existence_errors=True,
        enable_token_revocation=True,
    )

    # A truthy override from cdk.json is used verbatim (no account-id
    # suffix — operators who override typically want a short memorable
    # value); otherwise the stock prefix plus the account id is used.
    domain_prefix = (
        self._cognito_domain_prefix_override
        or f"{COGNITO_DOMAIN_PREFIX_DEFAULT}-{self.account}"
    )

    self.cognito_domain = self.cognito_pool.add_domain(
        "StudioUserPoolDomain",
        cognito_domain=cognito.CognitoDomainOptions(domain_prefix=domain_prefix),
    )

    # (construct id, value, description) for each stack output.
    stack_outputs = (
        (
            "CognitoUserPoolId",
            self.cognito_pool.user_pool_id,
            "ID of the Cognito user pool that gates SageMaker Studio",
        ),
        (
            "CognitoUserPoolArn",
            self.cognito_pool.user_pool_arn,
            "ARN of the Cognito user pool",
        ),
        (
            "CognitoUserPoolClientId",
            self.cognito_client.user_pool_client_id,
            "Client ID used by the GCO CLI for SRP auth",
        ),
    )
    for output_id, output_value, output_description in stack_outputs:
        CfnOutput(
            self,
            output_id,
            value=output_value,
            description=output_description,
        )
1260 # ==================================================================
1261 # Presigned-URL Lambda
1262 # ==================================================================
def _create_presigned_url_lambda(self) -> None:
    """Build the ``Presigned_URL_Lambda`` together with its role and log group.

    Per the original design notes the function backs API Gateway's
    ``/studio/login`` route (wired in ``GCOApiGatewayGlobalStack``) and is
    defined on this stack so its role can reference
    ``SageMaker_Execution_Role`` and ``Studio_EFS`` without a cross-stack
    import.

    Function configuration:

    * Runtime: ``LAMBDA_PYTHON_RUNTIME``; code asset
      ``lambda/analytics-presigned-url``; handler
      ``handler.lambda_handler``; 256 MB memory.
    * Timeout: 29 s, chosen with API Gateway's 29-second integration
      timeout in mind. NOTE(review): the two timeouts are equal, so
      confirm the Lambda really gives up before API Gateway does.
    * Tracing: ``ACTIVE``.
    * Log group: owned by this stack, 1-month retention, destroyed with
      the stack.

    Role permissions (inline statements, added in this order):

    * ``sagemaker:ListDomains`` on ``*``.
    * ``sagemaker:DescribeDomain`` / ``CreatePresignedDomainUrl`` /
      ``DescribeUserProfile`` / ``CreateUserProfile`` / ``ListTags`` /
      ``AddTags`` on the ``domain/*`` and ``user-profile/*/*`` ARN
      families of this region and account.
    * ``iam:PassRole`` on ``SageMaker_Execution_Role.role_arn``,
      conditioned on ``iam:PassedToService=sagemaker.amazonaws.com``.
    * ``elasticfilesystem:DescribeAccessPoints`` / ``CreateAccessPoint``
      / ``TagResource`` on ``Studio_EFS.file_system_arn``.

    The ``AWSLambdaBasicExecutionRole`` managed policy is attached as
    well.

    Side effects: sets ``self.presigned_url_lambda_role`` and
    ``self.presigned_url_lambda``, emits the ``PresignedUrlLambdaArn``
    output, and makes ``self._cleanup_resource`` depend on the function.
    """
    from cdk_nag import NagSuppressions

    # Dedicated, narrowly scoped role that is not shared with any other
    # Lambda. Only the basic execution policy is AWS-managed; every
    # other permission is an inline statement owned by this stack.
    basic_execution_policy = iam.ManagedPolicy.from_aws_managed_policy_name(
        "service-role/AWSLambdaBasicExecutionRole"
    )
    lambda_role = iam.Role(
        self,
        "PresignedUrlLambdaRole",
        assumed_by=iam.ServicePrincipal("lambda.amazonaws.com"),
        description=(
            "Execution role for the analytics presigned-URL Lambda. "
            "Scoped to SageMaker domain + user-profile operations, "
            "PassRole on SageMaker_Execution_Role, and EFS access-"
            "point management on Studio_EFS."
        ),
        managed_policies=[basic_execution_policy],
    )
    self.presigned_url_lambda_role = lambda_role

    # The DomainId is looked up by the handler at invoke time
    # (list_domains), so these ARNs wildcard the id segments while the
    # region and account stay pinned.
    domain_arn_prefix = f"arn:aws:sagemaker:{self.region}:{self.account}:domain/*"
    user_profile_arn_prefix = f"arn:aws:sagemaker:{self.region}:{self.account}:user-profile/*/*"

    inline_statements = [
        # ListDomains cannot be resource-scoped, hence ``*``; the
        # matching IAM5 suppression is registered further down.
        iam.PolicyStatement(
            effect=iam.Effect.ALLOW,
            actions=["sagemaker:ListDomains"],
            resources=["*"],
        ),
        # Domain and user-profile operations on this account's Studio
        # resources in this region.
        iam.PolicyStatement(
            effect=iam.Effect.ALLOW,
            actions=[
                "sagemaker:DescribeDomain",
                "sagemaker:CreatePresignedDomainUrl",
                "sagemaker:DescribeUserProfile",
                "sagemaker:CreateUserProfile",
                "sagemaker:ListTags",
                "sagemaker:AddTags",
            ],
            resources=[domain_arn_prefix, user_profile_arn_prefix],
        ),
        # PassRole limited to the SageMaker execution role and to the
        # SageMaker service — this is what CreateUserProfile passes in
        # its ``ExecutionRole`` field.
        iam.PolicyStatement(
            effect=iam.Effect.ALLOW,
            actions=["iam:PassRole"],
            resources=[self.sagemaker_execution_role.role_arn],
            conditions={
                "StringEquals": {
                    "iam:PassedToService": "sagemaker.amazonaws.com",
                }
            },
        ),
        # Access-point management on Studio_EFS only; the handler
        # creates one access point per Cognito user at first login.
        iam.PolicyStatement(
            effect=iam.Effect.ALLOW,
            actions=[
                "elasticfilesystem:DescribeAccessPoints",
                "elasticfilesystem:CreateAccessPoint",
                "elasticfilesystem:TagResource",
            ],
            resources=[self.studio_efs.file_system_arn],
        ),
    ]
    for statement in inline_statements:
        lambda_role.add_to_policy(statement)

    # Declaring the log group here (rather than letting Lambda create
    # it on first invocation) puts the retention setting in the
    # template.
    presigned_url_log_group = logs.LogGroup(
        self,
        "PresignedUrlLambdaLogGroup",
        retention=logs.RetentionDays.ONE_MONTH,
        removal_policy=RemovalPolicy.DESTROY,
    )

    handler_environment = {
        "STUDIO_DOMAIN_ID": self.studio_domain.attr_domain_id,
        "SAGEMAKER_EXECUTION_ROLE_ARN": self.sagemaker_execution_role.role_arn,
        "STUDIO_EFS_ID": self.studio_efs.file_system_id,
        "URL_EXPIRES_SECONDS": "300",
        "SESSION_EXPIRES_SECONDS": "43200",
    }
    self.presigned_url_lambda = lambda_.Function(
        self,
        "PresignedUrlFunction",
        runtime=getattr(lambda_.Runtime, LAMBDA_PYTHON_RUNTIME),
        handler="handler.lambda_handler",
        code=lambda_.Code.from_asset("lambda/analytics-presigned-url"),
        role=lambda_role,
        timeout=Duration.seconds(29),
        memory_size=256,
        tracing=lambda_.Tracing.ACTIVE,
        log_group=presigned_url_log_group,
        description=(
            "Exchanges a Cognito-authorized event for a presigned "
            "SageMaker Studio URL. Wired into /studio/login by "
            "GCOApiGatewayGlobalStack."
        ),
        environment=handler_environment,
    )

    CfnOutput(
        self,
        "PresignedUrlLambdaArn",
        value=self.presigned_url_lambda.function_arn,
        description=(
            "ARN of the presigned-URL Lambda - consumed by the API "
            "Gateway stack's /studio/login integration."
        ),
    )

    # One IAM5 suppression covering the three wildcard resources used
    # above; the reason text documents why each cannot be tightened.
    wildcard_suppression = {
        "id": "AwsSolutions-IAM5",
        "reason": (
            "sagemaker:ListDomains does not support resource-"
            "level scoping — the AWS API only accepts "
            "Resource: *. Effective blast radius: a single "
            "paginated list call per Lambda invocation "
            "against this account's SageMaker control plane "
            "in this region. The remaining SageMaker actions "
            "(DescribeDomain, CreatePresignedDomainUrl, "
            "DescribeUserProfile, CreateUserProfile, "
            "ListTags, AddTags) are scoped to the literal "
            "arn:aws:sagemaker:<region>:<account>:domain/* "
            "and arn:aws:sagemaker:<region>:<account>:"
            "user-profile/*/* ARN families, which is the "
            "tightest we can achieve at synth time because "
            "DomainId is only resolvable at invoke time."
        ),
        "appliesTo": [
            "Resource::*",
            ("Resource::arn:aws:sagemaker:<AWS::Region>:<AWS::AccountId>:domain/*"),
            (
                "Resource::arn:aws:sagemaker:<AWS::Region>:"
                "<AWS::AccountId>:user-profile/*/*"
            ),
        ],
    }
    NagSuppressions.add_resource_suppressions(
        lambda_role,
        [wildcard_suppression],
        apply_to_children=True,
    )

    # Deletion ordering: the cleanup custom resource must fire AFTER
    # this Lambda is deleted during stack destruction; otherwise
    # in-flight login requests could recreate user profiles between
    # cleanup and domain deletion. The dependency can only be added
    # here, once the function exists.
    self._cleanup_resource.node.add_dependency(self.presigned_url_lambda)
1482 # ==================================================================
1483 # Nag suppressions
1484 # ==================================================================
def _apply_nag_suppressions(self) -> None:
    """Apply the stack-wide cdk-nag suppressions for the analytics stack.

    Delegates to ``apply_all_suppressions`` with ``stack_type="analytics"``,
    no regions list, and the global / API-gateway regions read from
    ``self.config``. According to the original notes, the analytics
    branch in ``gco/stacks/nag_suppressions.py`` covers SageMaker,
    Cognito, EMR Serverless, storage (``Studio_Only_Bucket`` and the
    access-logs bucket), Lambda, and IAM suppressions.
    """
    config = self.config
    global_region = config.get_global_region()
    api_gateway_region = config.get_api_gateway_region()

    apply_all_suppressions(
        self,
        stack_type="analytics",
        regions=None,
        global_region=global_region,
        api_gateway_region=api_gateway_region,
    )