Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion charts/model-engine/Chart.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,7 @@ type: application
# This is the chart version. This version number should be incremented each time you make changes
# to the chart and its templates, including the app version.
# Versions are expected to follow Semantic Versioning (https://semver.org/)
version: 0.2.6
version: 0.2.7

# This is the version number of the application being deployed. This version number should be
# incremented each time you make changes to the application. Versions are not expected to
Expand Down
16 changes: 10 additions & 6 deletions charts/model-engine/templates/_helpers.tpl
Original file line number Diff line number Diff line change
Expand Up @@ -51,7 +51,7 @@ Create chart name and version as used by the chart label.
team: infra
app.kubernetes.io/version: {{ .Values.tag }}
tags.datadoghq.com/version: {{ .Values.tag }}
tags.datadoghq.com/env: {{ .Values.context }}
tags.datadoghq.com/env: {{ .Values.datadog.env | default .Values.context }}
env: {{ .Values.context }}
{{- if .Values.azure }}
azure.workload.identity/use: "true"
Expand Down Expand Up @@ -95,7 +95,7 @@ owner: ${OWNER}
env: {{- .Values.context | printf " %s" }}
managed-by: {{- include "modelEngine.fullname" . | printf " %s\n" -}}
use_scale_launch_endpoint_network_policy: "true"
tags.datadoghq.com/env: {{- .Values.context | printf " %s" }}
tags.datadoghq.com/env: ${DD_ENV}
tags.datadoghq.com/version: ${GIT_TAG}
{{- if .Values.azure }}
azure.workload.identity/use: "true"
Expand Down Expand Up @@ -159,7 +159,7 @@ env:
- name: DD_SERVICE
value: "${ENDPOINT_NAME}"
- name: DD_ENV
value: {{ .Values.context }}
value: "${DD_ENV}"
- name: DD_VERSION
value: "${GIT_TAG}"
- name: DD_AGENT_HOST
Expand Down Expand Up @@ -223,7 +223,7 @@ env:
- name: DD_SERVICE
value: "${ENDPOINT_NAME}"
- name: DD_ENV
value: {{ .Values.context }}
value: "${DD_ENV}"
- name: DD_VERSION
value: "${GIT_TAG}"
- name: DD_AGENT_HOST
Expand Down Expand Up @@ -296,8 +296,8 @@ env:
value: "{{ .Values.dd_trace_enabled }}"
- name: DD_REMOTE_CONFIGURATION_ENABLED
value: "false"
- name: DD_ENV
value: {{ .Values.context }}
{{- /* DD_ENV is set in the serviceEnvGitTag* wrappers: a Helm value for control-plane
pods, and the ${DD_ENV} runtime substitution for python-rendered endpoints. */}}
- name: DD_AGENT_HOST
valueFrom:
fieldRef:
Expand Down Expand Up @@ -421,6 +421,8 @@ env:

{{- define "modelEngine.serviceEnvGitTagFromHelmVar" }}
{{- include "modelEngine.serviceEnvBase" . }}
- name: DD_ENV
value: {{ .Values.datadog.env | default .Values.context }}
- name: DD_VERSION
value: {{ .Values.tag }}
- name: GIT_TAG
Expand All @@ -432,6 +434,8 @@ env:

{{- define "modelEngine.serviceEnvGitTagFromPythonReplace" }}
{{- include "modelEngine.serviceEnvBase" . }}
- name: DD_ENV
value: "${DD_ENV}"
- name: DD_VERSION
value: "${GIT_TAG}"
- name: GIT_TAG
Expand Down
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
{{- if .Values.celery_autoscaler.enabled }}
{{- if not .Values.serviceIdentifier }}
{{- $app := include "modelEngine.celeryautoscalername" . }}
{{- $env := .Values.context }}
{{- $env := .Values.datadog.env | default .Values.context }}
{{- $tag := .Values.tag }}
{{- $message_broker := .Values.celeryBrokerType }}
{{- $num_shards := .Values.celery_autoscaler.num_shards }}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,6 @@
{{- $forwarder_repository := .Values.image.forwarderRepository -}}
{{- $triton_repository := .Values.triton.image.repository -}}
{{- $triton_tag := .Values.triton.image.tag -}}
{{- $env := .Values.context -}}
{{- $service_template_labels := include "modelEngine.serviceTemplateLabels" . }}
{{- $job_template_labels := include "modelEngine.jobTemplateLabels" . }}
{{- $service_env := include "modelEngine.serviceEnvGitTagFromPythonReplace" . }}
Expand Down Expand Up @@ -1084,7 +1083,7 @@ data:
sidecar.istio.io/inject: "false"
version: v1
annotations:
ad.datadoghq.com/main.logs: '[{"source": "python", "service": "${RESOURCE_NAME}", "tags": ["env:{{ $env }}", "launch_job_id:${JOB_ID}"]}]'
ad.datadoghq.com/main.logs: '[{"source": "python", "service": "${RESOURCE_NAME}", "tags": ["env:${DD_ENV}", "launch_job_id:${JOB_ID}"]}]'
cluster-autoscaler.kubernetes.io/safe-to-evict: "false"
spec:
restartPolicy: Never
Expand Down Expand Up @@ -1193,7 +1192,7 @@ data:
sidecar.istio.io/inject: "false"
version: v1
annotations:
ad.datadoghq.com/main.logs: '[{"source": "python", "service": "${RESOURCE_NAME}", "tags": ["env:{{ $env }}", "launch_job_id:${JOB_ID}"]}]'
ad.datadoghq.com/main.logs: '[{"source": "python", "service": "${RESOURCE_NAME}", "tags": ["env:${DD_ENV}", "launch_job_id:${JOB_ID}"]}]'
cluster-autoscaler.kubernetes.io/safe-to-evict: "false"
spec:
restartPolicy: Never
Expand Down
8 changes: 8 additions & 0 deletions charts/model-engine/values.yaml
Original file line number Diff line number Diff line change
@@ -1,4 +1,12 @@
dd_trace_enabled: true

# datadog [optional] configures Datadog tagging for model-engine pods.
datadog:
# env [optional] sets the Datadog `env` tag (DD_ENV + tags.datadoghq.com/env) on both
# control-plane pods and launched inference endpoints. Falls back to `context` when empty.
# Set per-cluster (e.g. "sgp-dev") so pods report the cluster's real environment.
env: ""

spellbook:
enabled: false

Expand Down
7 changes: 7 additions & 0 deletions model-engine/model_engine_server/common/env_vars.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,7 @@

__all__: Sequence[str] = (
"CIRCLECI",
"DD_ENV",
"GIT_TAG",
"LAUNCH_SERVICE_TEMPLATE_CONFIG_MAP_PATH",
"LAUNCH_SERVICE_TEMPLATE_FOLDER",
Expand Down Expand Up @@ -96,3 +97,9 @@ def get_boolean_env_var(name: str) -> bool:
GIT_TAG: str = os.environ.get("GIT_TAG", "GIT_TAG_NOT_FOUND")
if GIT_TAG == "GIT_TAG_NOT_FOUND" and "pytest" not in sys.modules:
raise ValueError("GIT_TAG environment variable must be set")

# DD_ENV is the Datadog `env` tag. It is propagated to launched inference endpoints (via the
# ${DD_ENV} template substitution) so they report the same per-cluster environment as the
# gateway, instead of the build-time `context`. Falls back to infra_config().env when the
# DD_ENV environment variable is unset or empty on the gateway (e.g. local/CI).
DD_ENV: str = os.environ.get("DD_ENV") or infra_config().env
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@

from datadog import statsd
from model_engine_server.common.dtos.llms import TokenUsage
from model_engine_server.core.config import infra_config
from model_engine_server.common.env_vars import DD_ENV
from model_engine_server.domain.gateways.monitoring_metrics_gateway import (
MetricMetadata,
MonitoringMetricsGateway,
Expand All @@ -23,7 +23,7 @@ def get_model_tags(model_name: Optional[str]) -> List[str]:
class DatadogMonitoringMetricsGateway(MonitoringMetricsGateway):
def __init__(self, prefix: str = "model_engine"):
self.prefix = prefix
self.tags = [f"env:{infra_config().env}"]
self.tags = [f"env:{DD_ENV}"]

def emit_attempted_build_metric(self):
statsd.increment("scale_launch.service_builder.attempt", tags=self.tags)
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,7 @@
from model_engine_server.common.dtos.resource_manager import CreateOrUpdateResourcesRequest
from model_engine_server.common.env_vars import (
CIRCLECI,
DD_ENV,
LAUNCH_SERVICE_TEMPLATE_CONFIG_MAP_PATH,
LAUNCH_SERVICE_TEMPLATE_FOLDER,
MODEL_CACHE_MOUNT_PATH,
Expand Down Expand Up @@ -277,6 +278,10 @@ def load_k8s_yaml(key: str, substitution_kwargs: ResourceArguments) -> Dict[str,
# K8s/container error at deploy time, rather than a KeyError deep
# inside the service-builder celery task.
filtered_kwargs = {k: v for k, v in substitution_kwargs.items() if v is not None}
# Inject the Datadog env tag for every launched resource (endpoints, batch jobs, etc.)
# so any ${DD_ENV} in labels / env vars / log configs resolves to the gateway's
# per-cluster env (set via the chart's datadog.env). setdefault keeps a non-None DD_ENV
# already supplied by the caller, so callers can still override it.
filtered_kwargs.setdefault("DD_ENV", DD_ENV)
yaml_str = Template(template_str).safe_substitute(**filtered_kwargs)
try:
yaml_obj = yaml.safe_load(yaml_str)
Expand Down