Skip to main content

Management & Operations

How to implement monitoring, logging, automation, and operational excellence in landing zones.

Operations Architecture

Log Analytics Design

Workspace Strategy

| Pattern | Description | Best For |
| --- | --- | --- |
| Centralized | Single workspace for all | Small-medium orgs |
| Decentralized | Workspace per team/app | Large orgs, data sovereignty |
| Hybrid | Platform + application workspaces | Enterprise (recommended) |

Bicep: Log Analytics Workspace

// Platform Log Analytics workspace on the pay-per-GB pricing tier.
// `location` is expected to be declared elsewhere in the template.
resource logAnalytics 'Microsoft.OperationalInsights/workspaces@2022-10-01' = {
  location: location
  name: 'log-platform-prod-001'
  properties: {
    // Keep data for 730 days (two years) to satisfy security retention.
    retentionInDays: 730
    // Stop ingestion for the day once 100 GB has been collected.
    workspaceCapping: {
      dailyQuotaGb: 100
    }
    features: {
      // Let users read logs through permissions on the emitting resource.
      enableLogAccessUsingOnlyResourcePermissions: true
    }
    sku: {
      name: 'PerGB2018'
    }
  }
}

// Solutions
// Security solution attached to the platform workspace. The resource name and
// the plan name must both follow the 'SolutionType(workspaceName)' pattern.
resource securitySolution 'Microsoft.OperationsManagement/solutions@2015-11-01-preview' = {
  location: location
  name: 'Security(${logAnalytics.name})'
  plan: {
    product: 'OMSGallery/Security'
    publisher: 'Microsoft'
    name: 'Security(${logAnalytics.name})'
    promotionCode: ''
  }
  properties: {
    workspaceResourceId: logAnalytics.id
  }
}

// VM Insights solution attached to the same workspace; the resource name and
// plan name are kept identical, in the 'SolutionType(workspaceName)' form.
resource vmInsightsSolution 'Microsoft.OperationsManagement/solutions@2015-11-01-preview' = {
  location: location
  name: 'VMInsights(${logAnalytics.name})'
  plan: {
    product: 'OMSGallery/VMInsights'
    publisher: 'Microsoft'
    name: 'VMInsights(${logAnalytics.name})'
    promotionCode: ''
  }
  properties: {
    workspaceResourceId: logAnalytics.id
  }
}

Azure Monitor Configuration

Data Collection Rules

// Data collection rule for Windows VMs: samples core performance counters
// every 60 seconds and collects Application/System/Security event logs, then
// routes both streams to the platform Log Analytics workspace.
resource dataCollectionRule 'Microsoft.Insights/dataCollectionRules@2022-06-01' = {
  name: 'dcr-windows-vm'
  location: location
  properties: {
    dataSources: {
      performanceCounters: [
        {
          name: 'perfCounterDataSource'
          streams: ['Microsoft-Perf']
          samplingFrequencyInSeconds: 60
          // Backslashes are doubled because '\' is the Bicep escape character.
          counterSpecifiers: [
            '\\Processor Information(_Total)\\% Processor Time'
            '\\Memory\\Available Bytes'
            // Needed by the "Memory Pressure" workbook query, which filters
            // on CounterName == "% Committed Bytes In Use".
            '\\Memory\\% Committed Bytes In Use'
            '\\LogicalDisk(_Total)\\% Free Space'
            '\\LogicalDisk(_Total)\\Avg. Disk sec/Read'
            '\\LogicalDisk(_Total)\\Avg. Disk sec/Write'
          ]
        }
      ]
      windowsEventLogs: [
        {
          name: 'eventLogsDataSource'
          streams: ['Microsoft-Event']
          xPathQueries: [
            // Levels 1-3: Critical, Error and Warning events.
            'Application!*[System[(Level=1 or Level=2 or Level=3)]]'
            'System!*[System[(Level=1 or Level=2 or Level=3)]]'
            // Keyword bitmask selecting Security audit events.
            'Security!*[System[(band(Keywords,13510798882111488))]]'
          ]
        }
      ]
    }
    destinations: {
      logAnalytics: [
        {
          workspaceResourceId: logAnalytics.id
          name: 'centralWorkspace'
        }
      ]
    }
    // Route every collected stream to the single workspace destination.
    dataFlows: [
      {
        streams: ['Microsoft-Perf', 'Microsoft-Event']
        destinations: ['centralWorkspace']
      }
    ]
  }
}

Diagnostic Settings for PaaS

// Sends all resource logs and platform metrics of the scoped resource to the
// platform Log Analytics workspace.
//
// The per-category `retentionPolicy` objects were removed: the property is
// deprecated and only ever applied to storage-account destinations, so it has
// no effect when the destination is a workspace (workspace retention applies).
//
// NOTE(review): `storageAccount` is declared elsewhere. For storage accounts,
// resource logs are typically exposed on the child services (blob/queue/table/
// file) rather than the account itself, and the supported category names may
// differ -- verify the categories supported by the actual scope.
resource diagnosticSetting 'Microsoft.Insights/diagnosticSettings@2021-05-01-preview' = {
  name: 'send-to-log-analytics'
  scope: storageAccount
  properties: {
    workspaceId: logAnalytics.id
    logs: [
      {
        categoryGroup: 'allLogs'
        enabled: true
      }
    ]
    metrics: [
      {
        category: 'AllMetrics'
        enabled: true
      }
    ]
  }
}

Alert Configuration

Alert Categories

Bicep: Alert Rules

// Action Group
// Notification targets for critical platform alerts: team e-mail, on-call SMS
// and a PagerDuty webhook. Action groups are deployed to the 'global' location.
resource actionGroup 'Microsoft.Insights/actionGroups@2023-01-01' = {
  location: 'global'
  name: 'ag-platform-critical'
  properties: {
    enabled: true
    groupShortName: 'PlatCrit'
    webhookReceivers: [
      {
        name: 'PagerDuty'
        useCommonAlertSchema: true
        // 'xxx' is a placeholder for the integration key.
        serviceUri: 'https://events.pagerduty.com/integration/xxx/enqueue'
      }
    ]
    smsReceivers: [
      {
        name: 'On-Call'
        phoneNumber: '5551234567'
        countryCode: '1'
      }
    ]
    emailReceivers: [
      {
        name: 'Platform Team'
        useCommonAlertSchema: true
        emailAddress: 'platform-team@contoso.com'
      }
    ]
  }
}

// VM CPU Alert
// Subscription-wide metric alert: fires when the average 'Percentage CPU' of a
// virtual machine in `location` exceeds 90 over a 5-minute window, evaluated
// every minute, and notifies the platform action group.
resource cpuAlert 'Microsoft.Insights/metricAlerts@2018-03-01' = {
  location: 'global'
  name: 'alert-vm-cpu-high'
  properties: {
    enabled: true
    severity: 2
    description: 'CPU utilization above 90% for 5 minutes'
    // A subscription-level scope needs the target type and region so the
    // rule knows which resources to monitor.
    scopes: [
      subscription().id
    ]
    targetResourceType: 'Microsoft.Compute/virtualMachines'
    targetResourceRegion: location
    windowSize: 'PT5M'
    evaluationFrequency: 'PT1M'
    criteria: {
      'odata.type': 'Microsoft.Azure.Monitor.MultipleResourceMultipleMetricCriteria'
      allOf: [
        {
          criterionType: 'StaticThresholdCriterion'
          name: 'HighCPU'
          metricName: 'Percentage CPU'
          timeAggregation: 'Average'
          operator: 'GreaterThan'
          threshold: 90
          dimensions: []
        }
      ]
    }
    actions: [
      {
        actionGroupId: actionGroup.id
      }
    ]
  }
}

// Service Health Alert
// Activity-log alert on the current subscription that notifies the platform
// action group whenever a Service Health event of type 'Incident' is logged.
resource serviceHealthAlert 'Microsoft.Insights/activityLogAlerts@2020-10-01' = {
  location: 'global'
  name: 'alert-service-health'
  properties: {
    scopes: [
      subscription().id
    ]
    enabled: true
    actions: {
      actionGroups: [
        {
          actionGroupId: actionGroup.id
        }
      ]
    }
    // Both conditions must match: Service Health category AND incident type.
    condition: {
      allOf: [
        {
          equals: 'ServiceHealth'
          field: 'category'
        }
        {
          equals: 'Incident'
          field: 'properties.incidentType'
        }
      ]
    }
  }
}

Update Management

Azure Update Manager

Maintenance Configuration

// Weekly in-guest patching window for Windows machines: every Saturday at
// 02:00 UTC for three hours, installing Critical and Security updates and
// rebooting only when an update requires it.
resource maintenanceConfig 'Microsoft.Maintenance/maintenanceConfigurations@2023-04-01' = {
  location: location
  name: 'mc-prod-windows-weekly'
  properties: {
    maintenanceScope: 'InGuestPatch'
    extensionProperties: {
      InGuestPatchMode: 'User'
    }
    maintenanceWindow: {
      timeZone: 'UTC'
      // 2024-01-06 is a Saturday, matching the weekly recurrence below.
      startDateTime: '2024-01-06 02:00'
      recurEvery: 'Week Saturday'
      duration: '03:00'
    }
    installPatches: {
      rebootSetting: 'IfRequired'
      windowsParameters: {
        excludeKbsRequiringReboot: false
        classificationsToInclude: [
          'Critical'
          'Security'
        ]
      }
    }
  }
}

Azure Automation

Automation Architecture

Sample: Auto-Stop VMs Runbook

# Runbook: Stop-VMs-ByTag.ps1
#
# Stops every running VM whose tag $TagName has the value $TagValue.
#
# Parameters:
#   TagName        - Name of the tag to match (required).
#   TagValue       - Value the tag must have (required).
#   SubscriptionId - Optional subscription to switch to before querying VMs;
#                    when omitted, the identity's default context is used.
#
# Output: one status line per matching VM, plus whatever Stop-AzVM returns.
param(
    [Parameter(Mandatory=$true)]
    [string]$TagName,

    [Parameter(Mandatory=$true)]
    [string]$TagValue,

    [Parameter(Mandatory=$false)]
    [string]$SubscriptionId
)

# Fail fast: without this, a failed sign-in would let the script continue and
# report misleading results.
$ErrorActionPreference = 'Stop'

# Do not inherit a context saved by another job in the same sandbox.
$null = Disable-AzContextAutosave -Scope Process

# Authenticate using managed identity (result discarded to keep job output clean)
$null = Connect-AzAccount -Identity

if ($SubscriptionId) {
    $null = Set-AzContext -SubscriptionId $SubscriptionId
}

# Get VMs with the specified tag. Untagged VMs have a null Tags collection,
# so guard before indexing to avoid "Cannot index into a null array" errors.
$vms = @(Get-AzVM | Where-Object {
    $null -ne $_.Tags -and
    $_.Tags.ContainsKey($TagName) -and
    $_.Tags[$TagName] -eq $TagValue
})

foreach ($vm in $vms) {
    try {
        $vmStatus = Get-AzVM -ResourceGroupName $vm.ResourceGroupName -Name $vm.Name -Status
        $powerState = ($vmStatus.Statuses | Where-Object { $_.Code -like 'PowerState/*' }).Code

        if ($powerState -eq 'PowerState/running') {
            Write-Output "Stopping VM: $($vm.Name)"
            Stop-AzVM -ResourceGroupName $vm.ResourceGroupName -Name $vm.Name -Force
        }
        else {
            # The VM may be stopped, deallocated, starting, etc. -- report the
            # real state instead of claiming it is already stopped.
            Write-Output "VM $($vm.Name) is not running (state: $powerState); skipping"
        }
    }
    catch {
        # One failing VM must not prevent the remaining VMs from being stopped.
        Write-Error -Message "Failed to process VM $($vm.Name): $_" -ErrorAction Continue
    }
}

Bicep: Automation Account

// Platform Automation account. The system-assigned managed identity is what
// runbooks use to authenticate (Connect-AzAccount -Identity); encryption uses
// service-managed keys.
resource automationAccount 'Microsoft.Automation/automationAccounts@2022-08-08' = {
  location: location
  name: 'aa-platform-prod-001'
  properties: {
    encryption: {
      keySource: 'Microsoft.Automation'
    }
    sku: {
      name: 'Basic'
    }
  }
  identity: {
    type: 'SystemAssigned'
  }
}

// Timestamp captured at deployment time; used only to derive a start time
// that lies in the future. Leave at its default (utcNow() is only permitted
// in a parameter default value).
@description('Deployment timestamp used to compute the first run of the stop schedule. Do not override.')
param scheduleBaseTime string = utcNow('u')

// Automation rejects schedules whose start time is in the past, so a
// hard-coded date such as 2024-01-01 fails to deploy. Start at 19:00 UTC on
// the day after deployment instead. Note that each redeployment moves the
// start date forward by design.
var stopScheduleStartDate = dateTimeAdd(scheduleBaseTime, 'P1D', 'yyyy-MM-dd')
var stopScheduleStartTime = '${stopScheduleStartDate}T19:00:00+00:00'

// Daily 19:00 UTC schedule intended to trigger the dev-VM stop runbook.
// NOTE(review): this snippet does not link the schedule to a runbook; a
// jobSchedules resource is needed for the schedule to start anything.
resource schedule 'Microsoft.Automation/automationAccounts/schedules@2022-08-08' = {
  parent: automationAccount
  name: 'schedule-stop-dev-vms'
  properties: {
    description: 'Stop dev VMs at 7 PM'
    startTime: stopScheduleStartTime
    frequency: 'Day'
    interval: 1
    timeZone: 'UTC'
  }
}

Workbooks & Dashboards

Platform Workbook Structure

Sample: VM Performance Query

// VM Performance Overview
// Hourly average CPU per computer over the last 24 hours, as a time chart.
// The data collection rule above collects '\Processor Information(_Total)\...',
// which is stored with ObjectName "Processor Information"; "Processor" is kept
// so data from counters configured under the older object name still matches.
Perf
| where TimeGenerated > ago(24h)
| where ObjectName in ("Processor", "Processor Information") and CounterName == "% Processor Time"
| summarize AvgCPU = avg(CounterValue) by Computer, bin(TimeGenerated, 1h)
| render timechart

// Memory Pressure
// Computers whose hourly average committed memory exceeded 80% in the last 24 hours.
Perf
| where TimeGenerated > ago(24h)
    and ObjectName == "Memory"
    and CounterName == "% Committed Bytes In Use"
| summarize AvgMemory = avg(CounterValue) by Computer, bin(TimeGenerated, 1h)
| where AvgMemory > 80

// Disk Space Low
// Disks (per computer and instance) that dropped below 20% free space in the last hour.
Perf
| where TimeGenerated > ago(1h)
    and ObjectName == "LogicalDisk"
    and CounterName == "% Free Space"
| summarize MinFreeSpace = min(CounterValue) by Computer, InstanceName
| where MinFreeSpace < 20

Azure Arc

Arc-Enabled Servers

Arc Extension Deployment

// Reference to the already-onboarded Arc-enabled server. `arcMachineName` is
// expected to be declared elsewhere in the template.
resource arcMachine 'Microsoft.HybridCompute/machines@2023-06-20-preview' existing = {
  name: arcMachineName
}

// Installs the Azure Monitor Agent for Windows on the Arc-enabled server.
// Uses `parent` instead of building the child name by string interpolation
// (Bicep linter rule use-parent-property); the deployed resource ID is the same.
resource arcServerExtension 'Microsoft.HybridCompute/machines/extensions@2023-06-20-preview' = {
  parent: arcMachine
  name: 'AzureMonitorWindowsAgent'
  location: location
  properties: {
    publisher: 'Microsoft.Azure.Monitor'
    type: 'AzureMonitorWindowsAgent'
    autoUpgradeMinorVersion: true
  }
}

Operations Checklist

✅ Monitoring

  • Log Analytics workspace deployed
  • Data collection rules configured
  • Diagnostic settings on all resources
  • VM Insights enabled
  • Application Insights for apps

✅ Alerting

  • Action groups configured
  • Platform health alerts
  • Security alerts
  • Cost budget alerts
  • Service health alerts

✅ Automation

  • Automation account deployed
  • Start/stop schedules for non-prod
  • Compliance runbooks
  • Maintenance configurations

✅ Patching

  • Update Manager configured
  • Maintenance windows defined
  • Production vs non-prod schedules
  • Compliance reporting

Quick Reference Card

| Component | Configuration |
| --- | --- |
| Log Analytics | Centralized for security, per-app for workloads |
| Retention | 2 years security, 90 days application |
| Alerts | Critical = 1, Important = 2, Warning = 3 |
| Patching | Weekly non-prod, monthly prod |
| Automation | Managed identity for auth |
| Workbooks | One per domain (compute, network, security) |

Next Steps

Continue to Business Continuity to learn about backup, disaster recovery, and availability.