Management & Operations
How to implement monitoring, logging, automation, and operational excellence in landing zones.
Operations Architecture
Log Analytics Design
Workspace Strategy
| Pattern | Description | Best For |
|---|---|---|
| Centralized | Single workspace for all | Small-medium orgs |
| Decentralized | Workspace per team/app | Large orgs, data sovereignty |
| Hybrid | Platform + application workspaces | Enterprise (recommended) |
Recommended Architecture
Bicep: Log Analytics Workspace
// Platform Log Analytics workspace that the solutions, data collection rule
// and diagnostic settings in this guide send their data to.
resource logAnalytics 'Microsoft.OperationalInsights/workspaces@2022-10-01' = {
  name: 'log-platform-prod-001'
  location: location
  properties: {
    sku: {
      name: 'PerGB2018'
    }
    // 730 days = 2 years, kept for security
    retentionInDays: 730
    // Daily ingestion cap, in GB
    workspaceCapping: {
      dailyQuotaGb: 100
    }
    features: {
      enableLogAccessUsingOnlyResourcePermissions: true
    }
  }
}
// Solutions
// Security solution attached to the platform workspace. The resource name and
// the plan name are the same string, built from the workspace name.
resource securitySolution 'Microsoft.OperationsManagement/solutions@2015-11-01-preview' = {
  name: 'Security(${logAnalytics.name})'
  location: location
  plan: {
    name: 'Security(${logAnalytics.name})'
    product: 'OMSGallery/Security'
    publisher: 'Microsoft'
    promotionCode: ''
  }
  properties: {
    workspaceResourceId: logAnalytics.id
  }
}
// VMInsights solution attached to the platform workspace. The resource name and
// the plan name are the same string, built from the workspace name.
resource vmInsightsSolution 'Microsoft.OperationsManagement/solutions@2015-11-01-preview' = {
  name: 'VMInsights(${logAnalytics.name})'
  location: location
  plan: {
    name: 'VMInsights(${logAnalytics.name})'
    product: 'OMSGallery/VMInsights'
    publisher: 'Microsoft'
    promotionCode: ''
  }
  properties: {
    workspaceResourceId: logAnalytics.id
  }
}
Azure Monitor Configuration
Data Collection Rules
// Data collection rule for Windows VMs: samples performance counters every
// 60 seconds and collects selected Windows event log entries, then routes both
// streams to the platform Log Analytics workspace.
resource dataCollectionRule 'Microsoft.Insights/dataCollectionRules@2022-06-01' = {
  name: 'dcr-windows-vm'
  location: location
  properties: {
    dataSources: {
      performanceCounters: [
        {
          name: 'perfCounterDataSource'
          streams: ['Microsoft-Perf']
          samplingFrequencyInSeconds: 60
          counterSpecifiers: [
            '\\Processor Information(_Total)\\% Processor Time'
            '\\Memory\\Available Bytes'
            // Needed by the "Memory Pressure" workbook query, which filters on
            // CounterName == "% Committed Bytes In Use".
            '\\Memory\\% Committed Bytes In Use'
            '\\LogicalDisk(_Total)\\% Free Space'
            '\\LogicalDisk(_Total)\\Avg. Disk sec/Read'
            '\\LogicalDisk(_Total)\\Avg. Disk sec/Write'
          ]
        }
      ]
      windowsEventLogs: [
        {
          name: 'eventLogsDataSource'
          streams: ['Microsoft-Event']
          xPathQueries: [
            // Application and System logs: levels 1, 2 and 3 only
            'Application!*[System[(Level=1 or Level=2 or Level=3)]]'
            'System!*[System[(Level=1 or Level=2 or Level=3)]]'
            // Security log: entries selected by a keyword bitmask
            'Security!*[System[(band(Keywords,13510798882111488))]]'
          ]
        }
      ]
    }
    destinations: {
      logAnalytics: [
        {
          workspaceResourceId: logAnalytics.id
          name: 'centralWorkspace'
        }
      ]
    }
    // Both streams go to the single workspace destination declared above.
    dataFlows: [
      {
        streams: ['Microsoft-Perf', 'Microsoft-Event']
        destinations: ['centralWorkspace']
      }
    ]
  }
}
Diagnostic Settings for PaaS
// Diagnostic setting that sends all resource logs and metrics of the scoped
// resource to the platform Log Analytics workspace.
//
// The deprecated per-category `retentionPolicy` objects were removed: they were
// disabled (enabled: false, days: 0) and retention for workspace data is set on
// the workspace itself (retentionInDays).
//
// NOTE(review): storage accounts typically expose resource logs on their child
// services (blobServices/default, fileServices/default, ...) rather than on the
// account itself - confirm that the `storageAccount` scope supports the
// `allLogs` category group and the `AllMetrics` category before using as-is.
resource diagnosticSetting 'Microsoft.Insights/diagnosticSettings@2021-05-01-preview' = {
  name: 'send-to-log-analytics'
  scope: storageAccount
  properties: {
    workspaceId: logAnalytics.id
    logs: [
      {
        categoryGroup: 'allLogs'
        enabled: true
      }
    ]
    metrics: [
      {
        category: 'AllMetrics'
        enabled: true
      }
    ]
  }
}
Alert Configuration
Alert Categories
Bicep: Alert Rules
// Action Group
// Notification targets for critical platform alerts: one e-mail receiver,
// one SMS receiver and one webhook receiver.
resource actionGroup 'Microsoft.Insights/actionGroups@2023-01-01' = {
  name: 'ag-platform-critical'
  location: 'global'
  properties: {
    enabled: true
    groupShortName: 'PlatCrit'
    webhookReceivers: [
      {
        name: 'PagerDuty'
        // Placeholder integration URL - replace 'xxx' with the real integration key
        serviceUri: 'https://events.pagerduty.com/integration/xxx/enqueue'
        useCommonAlertSchema: true
      }
    ]
    smsReceivers: [
      {
        name: 'On-Call'
        countryCode: '1'
        phoneNumber: '5551234567'
      }
    ]
    emailReceivers: [
      {
        name: 'Platform Team'
        emailAddress: 'platform-team@contoso.com'
        useCommonAlertSchema: true
      }
    ]
  }
}
// VM CPU Alert
// Subscription-scoped metric alert on virtual machines in `location`:
// average 'Percentage CPU' greater than 90 over a 5-minute window,
// evaluated every minute. Notifies the platform action group.
resource cpuAlert 'Microsoft.Insights/metricAlerts@2018-03-01' = {
  name: 'alert-vm-cpu-high'
  location: 'global'
  properties: {
    description: 'CPU utilization above 90% for 5 minutes'
    enabled: true
    severity: 2

    // Which resources are monitored
    scopes: [
      subscription().id
    ]
    targetResourceType: 'Microsoft.Compute/virtualMachines'
    targetResourceRegion: location

    // How often and over what window the condition is evaluated
    evaluationFrequency: 'PT1M'
    windowSize: 'PT5M'

    criteria: {
      'odata.type': 'Microsoft.Azure.Monitor.MultipleResourceMultipleMetricCriteria'
      allOf: [
        {
          name: 'HighCPU'
          criterionType: 'StaticThresholdCriterion'
          metricName: 'Percentage CPU'
          timeAggregation: 'Average'
          operator: 'GreaterThan'
          threshold: 90
          dimensions: []
        }
      ]
    }

    actions: [
      {
        actionGroupId: actionGroup.id
      }
    ]
  }
}
// Service Health Alert
// Activity log alert for the whole subscription that matches events with
// category 'ServiceHealth' and incident type 'Incident', and notifies the
// platform action group.
resource serviceHealthAlert 'Microsoft.Insights/activityLogAlerts@2020-10-01' = {
  name: 'alert-service-health'
  location: 'global'
  properties: {
    enabled: true
    scopes: [
      subscription().id
    ]
    actions: {
      actionGroups: [
        {
          actionGroupId: actionGroup.id
        }
      ]
    }
    // Both conditions must match (allOf)
    condition: {
      allOf: [
        {
          field: 'category'
          equals: 'ServiceHealth'
        }
        {
          field: 'properties.incidentType'
          equals: 'Incident'
        }
      ]
    }
  }
}
Update Management
Azure Update Manager
Maintenance Configuration
// In-guest patching schedule for Windows machines: installs Critical and
// Security updates in a 3-hour window that recurs every Saturday at 02:00 UTC,
// rebooting only if required.
resource maintenanceConfig 'Microsoft.Maintenance/maintenanceConfigurations@2023-04-01' = {
  name: 'mc-prod-windows-weekly'
  location: location
  properties: {
    maintenanceScope: 'InGuestPatch'
    extensionProperties: {
      InGuestPatchMode: 'User'
    }
    // When patches are installed
    maintenanceWindow: {
      recurEvery: 'Week Saturday'
      startDateTime: '2024-01-06 02:00'
      timeZone: 'UTC'
      duration: '03:00'
    }
    // What is installed and how reboots are handled
    installPatches: {
      rebootSetting: 'IfRequired'
      windowsParameters: {
        classificationsToInclude: [
          'Critical'
          'Security'
        ]
        excludeKbsRequiringReboot: false
      }
    }
  }
}
Azure Automation
Automation Architecture
Sample: Auto-Stop VMs Runbook
# Runbook: Stop-VMs-ByTag.ps1
#
# Stops every running VM in the current (or given) subscription whose tag
# $TagName has the value $TagValue.
#
# Parameters:
#   TagName        - name of the tag to match (required)
#   TagValue       - value the tag must have (required)
#   SubscriptionId - optional subscription to switch to before querying VMs
param(
    [Parameter(Mandatory=$true)]
    [string]$TagName,

    [Parameter(Mandatory=$true)]
    [string]$TagValue,

    [Parameter(Mandatory=$false)]
    [string]$SubscriptionId
)

# Make sure this job does not pick up a context saved by another job.
Disable-AzContextAutosave -Scope Process | Out-Null

# Authenticate using managed identity. Fail the job immediately if sign-in does
# not work, instead of continuing unauthenticated and reporting success.
Connect-AzAccount -Identity -ErrorAction Stop | Out-Null

if ($SubscriptionId) {
    Set-AzContext -SubscriptionId $SubscriptionId -ErrorAction Stop | Out-Null
}

# Get VMs with the specified tag. VMs without any tags have a null Tags
# collection, so guard before indexing into it.
$vms = Get-AzVM | Where-Object {
    $null -ne $_.Tags -and $_.Tags[$TagName] -eq $TagValue
}

foreach ($vm in $vms) {
    # The instance view (-Status) carries the power state as a 'PowerState/*' status code.
    $vmStatus = Get-AzVM -ResourceGroupName $vm.ResourceGroupName -Name $vm.Name -Status
    $powerState = ($vmStatus.Statuses | Where-Object { $_.Code -like 'PowerState/*' }).Code

    if ($powerState -eq 'PowerState/running') {
        Write-Output "Stopping VM: $($vm.Name)"
        Stop-AzVM -ResourceGroupName $vm.ResourceGroupName -Name $vm.Name -Force
    }
    else {
        # Not necessarily stopped (could be starting, deallocating, ...), so report the real state.
        Write-Output "Skipping VM $($vm.Name): not running (state: $powerState)"
    }
}
Bicep: Automation Account
// Platform Automation account with a system-assigned managed identity, which
// is what the runbook's `Connect-AzAccount -Identity` call signs in with.
resource automationAccount 'Microsoft.Automation/automationAccounts@2022-08-08' = {
  name: 'aa-platform-prod-001'
  location: location
  identity: {
    type: 'SystemAssigned'
  }
  properties: {
    encryption: {
      keySource: 'Microsoft.Automation'
    }
    sku: {
      name: 'Basic'
    }
  }
}
// Deployment timestamp, used only to derive a schedule start time that is in
// the future. utcNow() is only allowed as a parameter default; do not override.
param scheduleDeploymentTime string = utcNow('u')

// Date (yyyy-MM-dd) of the day after deployment. A fixed past date such as
// 2024-01-01 is rejected when the schedule is created, because the start time
// must lie in the future.
var scheduleStartDate = dateTimeAdd(scheduleDeploymentTime, 'P1D', 'yyyy-MM-dd')

// Daily schedule that fires at 19:00 UTC, starting the day after deployment.
resource schedule 'Microsoft.Automation/automationAccounts/schedules@2022-08-08' = {
  parent: automationAccount
  name: 'schedule-stop-dev-vms'
  properties: {
    description: 'Stop dev VMs at 7 PM'
    startTime: '${scheduleStartDate}T19:00:00+00:00'
    frequency: 'Day'
    interval: 1
    timeZone: 'UTC'
  }
}
Workbooks & Dashboards
Platform Workbook Structure
Sample: VM Performance Query
// VM Performance Overview
// Hourly average CPU per computer over the last 24 hours.
// The data collection rule in this guide collects '\Processor Information(_Total)\...',
// so accept both the "Processor" and "Processor Information" object names.
Perf
| where TimeGenerated > ago(24h)
| where ObjectName in ("Processor", "Processor Information") and CounterName == "% Processor Time"
| summarize AvgCPU = avg(CounterValue) by Computer, bin(TimeGenerated, 1h)
| render timechart

// Memory Pressure
// Computers whose hourly average committed memory exceeded 80% in the last 24 hours.
// Requires the '\Memory\% Committed Bytes In Use' counter to be collected.
Perf
| where TimeGenerated > ago(24h)
| where ObjectName == "Memory" and CounterName == "% Committed Bytes In Use"
| summarize AvgMemory = avg(CounterValue) by Computer, bin(TimeGenerated, 1h)
| where AvgMemory > 80

// Disk Space Low
// Disks (per computer and instance) whose free space dropped below 20% in the last hour.
Perf
| where TimeGenerated > ago(1h)
| where ObjectName == "LogicalDisk" and CounterName == "% Free Space"
| summarize MinFreeSpace = min(CounterValue) by Computer, InstanceName
| where MinFreeSpace < 20
Azure Arc
Arc-Enabled Servers
Arc Extension Deployment
// Installs the AzureMonitorWindowsAgent extension on the Arc-enabled machine
// named by `arcMachineName` (child resource addressed as '<machine>/<extension>').
resource arcServerExtension 'Microsoft.HybridCompute/machines/extensions@2023-06-20-preview' = {
  name: '${arcMachineName}/AzureMonitorWindowsAgent'
  location: location
  properties: {
    type: 'AzureMonitorWindowsAgent'
    publisher: 'Microsoft.Azure.Monitor'
    autoUpgradeMinorVersion: true
  }
}
Operations Checklist
✅ Monitoring
- Log Analytics workspace deployed
- Data collection rules configured
- Diagnostic settings on all resources
- VM Insights enabled
- Application Insights for apps
✅ Alerting
- Action groups configured
- Platform health alerts
- Security alerts
- Cost budget alerts
- Service health alerts
✅ Automation
- Automation account deployed
- Start/stop schedules for non-prod
- Compliance runbooks
- Maintenance configurations
✅ Patching
- Update Manager configured
- Maintenance windows defined
- Production vs non-prod schedules
- Compliance reporting
Quick Reference Card
| Component | Configuration |
|---|---|
| Log Analytics | Centralized for security, per-app for workloads |
| Retention | 2 years security, 90 days application |
| Alerts | Severity levels: Critical = 1, Important = 2, Warning = 3 (set through the alert rule `severity` property, which ranges from 0 to 4) |
| Patching | Weekly non-prod, monthly prod |
| Automation | Managed identity for auth |
| Workbooks | One per domain (compute, network, security) |
Next Steps
Continue to Business Continuity to learn about backup, disaster recovery, and availability.