1
0

initial commit

This commit is contained in:
xpk
2026-02-13 15:44:24 +08:00
parent 66be8224f4
commit 09ce4c881a
570 changed files with 61807 additions and 0 deletions
@@ -0,0 +1,50 @@
<!-- This readme file is generated with terraform-docs -->
This module installs Cloudwatch agent via SSM State Manager.
It creates an association that installs the agent on all instances once a day.
Then a default cloudwatch agent config is generated using amazon-cloudwatch-agent-config-wizard,
saved on /opt/aws/amazon-cloudwatch-agent/bin/config.json, supplemented with additional collections,
and uploaded on SSM parameter store as ```AmazonCloudWatch-linux```.
Note that for cloudwatch agent to fully function, the instance needs an instance profile with the
following managed policies attached:
* CloudWatchAgentServerPolicy
* AmazonSSMManagedInstanceCore
## Requirements
| Name | Version |
|------|---------|
| terraform | >= 1.3.0 |
| aws | >= 5.0 |
## Providers
| Name | Version |
|------|---------|
| aws | >= 5.0 |
## Modules
No modules.
## Resources
| Name | Type |
|------|------|
| [aws_ssm_association.ConfigCwAgent](https://registry.terraform.io/providers/hashicorp/aws/latest/docs/resources/ssm_association) | resource |
| [aws_ssm_association.InstallCwAgent](https://registry.terraform.io/providers/hashicorp/aws/latest/docs/resources/ssm_association) | resource |
| [aws_ssm_parameter.CwAgentConfigLinux](https://registry.terraform.io/providers/hashicorp/aws/latest/docs/resources/ssm_parameter) | resource |
## Inputs
No inputs.
## Outputs
No outputs.
---
## Authorship
This module was developed by UPDATE_THIS.
@@ -0,0 +1,135 @@
# State Manager association that runs the AWS-ConfigureAWSPackage document to
# (re)install the AmazonCloudWatchAgent package on a recurring schedule.
resource "aws_ssm_association" "InstallCwAgent" {
  association_name    = "CwAgentInstall"
  name                = "AWS-ConfigureAWSPackage"
  schedule_expression = "cron(0 00 01 ? * * *)"
  max_concurrency     = 10

  # Wildcard target: the association applies to all instances.
  targets {
    key    = "InstanceIds"
    values = ["*"]
  }

  # Arguments handed to the AWS-ConfigureAWSPackage document.
  parameters = {
    action              = "Install"
    additionalArguments = "{}"
    installationType    = "Uninstall and reinstall"
    name                = "AmazonCloudWatchAgent"
  }
}
# State Manager association that runs the AmazonCloudWatch-ManageAgent document
# to (re)configure the CloudWatch agent from the config stored in SSM Parameter
# Store, restarting the agent afterwards.
resource "aws_ssm_association" "ConfigCwAgent" {
  name                = "AmazonCloudWatch-ManageAgent"
  association_name    = "CwAgentConfiguration"
  schedule_expression = "cron(0 00 02 ? * * *)"
  max_concurrency     = 10

  parameters = {
    action = "configure"
    mode   = "ec2"
    # Reference the parameter resource instead of repeating its name as a
    # literal: the value is identical, but Terraform now knows the parameter
    # must exist before this association is created, and a rename of the
    # parameter can no longer leave this association pointing at a stale name.
    optionalConfigurationLocation = aws_ssm_parameter.CwAgentConfigLinux.name
    optionalConfigurationSource   = "ssm"
    optionalRestart               = "yes"
  }

  # Wildcard target: the association applies to all instances.
  targets {
    key    = "InstanceIds"
    values = ["*"]
  }
}
# Stores the rendered CloudWatch agent configuration (local.CwAgentLinuxConfig)
# in SSM Parameter Store under the name the ConfigCwAgent association reads.
resource "aws_ssm_parameter" "CwAgentConfigLinux" {
  name        = "AmazonCloudWatch-linux"
  type        = "String"
  description = "Cloudwatch agent Standard config for Linux"

  # JSON document built in the locals block of this module.
  value = local.CwAgentLinuxConfig
}
locals {
  # CloudWatch agent configuration for Linux, serialised to JSON for the
  # AmazonCloudWatch-linux SSM parameter. Written with native HCL object
  # syntax; jsonencode produces the same JSON document as the quoted form.
  CwAgentLinuxConfig = jsonencode({
    agent = {
      metrics_collection_interval = 60
      run_as_user                 = "root"
    }

    metrics = {
      aggregation_dimensions = [["InstanceId"]]

      # "$${...}" escapes Terraform interpolation so the agent receives the
      # literal "${aws:...}" placeholders.
      append_dimensions = {
        AutoScalingGroupName = "$${aws:AutoScalingGroupName}"
        ImageId              = "$${aws:ImageId}"
        InstanceId           = "$${aws:InstanceId}"
        InstanceType         = "$${aws:InstanceType}"
      }

      metrics_collected = {
        cpu = {
          measurement = [
            "cpu_usage_idle",
            "cpu_usage_iowait",
            "cpu_usage_user",
            "cpu_usage_system",
          ]
          metrics_collection_interval = 60
          resources                   = ["*"]
          totalcpu                    = false
        }

        disk = {
          measurement                 = ["used_percent", "inodes_free"]
          metrics_collection_interval = 60
          resources                   = ["*"]
          ignore_file_system_types = [
            "devtmpfs",
            "overlay",
            "sysfs",
            "tmpfs",
          ]
        }

        diskio = {
          measurement                 = ["io_time"]
          metrics_collection_interval = 60
          resources                   = ["*"]
        }

        mem = {
          measurement                 = ["mem_used_percent"]
          metrics_collection_interval = 60
        }

        statsd = {
          metrics_aggregation_interval = 60
          metrics_collection_interval  = 10
          service_address              = ":8125"
        }

        swap = {
          measurement                 = ["swap_used_percent"]
          metrics_collection_interval = 60
        }

        # Supplementary collections added on top of the wizard defaults.
        net = {
          measurement                 = ["net_err_in", "net_err_out"]
          metrics_collection_interval = 60
        }

        processes = {
          measurement                 = ["processes_total"]
          metrics_collection_interval = 60
        }
      }
    }
  })
}
@@ -0,0 +1,9 @@
# Version constraints for the CloudWatch agent module.
terraform {
  # Minimum Terraform CLI version.
  required_version = ">= 1.3.0"

  required_providers {
    # AWS provider, resolved from the public registry.
    aws = {
      version = ">= 5.0"
      source  = "hashicorp/aws"
    }
  }
}
@@ -0,0 +1,61 @@
<!-- This readme file is generated with terraform-docs -->
This module configures a CloudWatch Logs subscription and streams the logs to an S3 bucket via Kinesis Data Firehose.
## Requirements
| Name | Version |
|------|---------|
| terraform | ~> 1.3.0 |
| aws | >= 5.0 |
## Providers
| Name | Version |
|------|---------|
| aws | >= 5.0 |
| random | n/a |
## Modules
No modules.
## Resources
| Name | Type |
|------|------|
| [aws_cloudwatch_log_group.firehose-log](https://registry.terraform.io/providers/hashicorp/aws/latest/docs/resources/cloudwatch_log_group) | resource |
| [aws_cloudwatch_log_subscription_filter.cwl-sub-filter](https://registry.terraform.io/providers/hashicorp/aws/latest/docs/resources/cloudwatch_log_subscription_filter) | resource |
| [aws_iam_policy.cwlog-role-policy](https://registry.terraform.io/providers/hashicorp/aws/latest/docs/resources/iam_policy) | resource |
| [aws_iam_policy.firehose-role-policy](https://registry.terraform.io/providers/hashicorp/aws/latest/docs/resources/iam_policy) | resource |
| [aws_iam_role.cwlog-stream-role](https://registry.terraform.io/providers/hashicorp/aws/latest/docs/resources/iam_role) | resource |
| [aws_iam_role.firehose-stream-iam-role](https://registry.terraform.io/providers/hashicorp/aws/latest/docs/resources/iam_role) | resource |
| [aws_iam_role_policy_attachment.cwlog-role-policy-attachment](https://registry.terraform.io/providers/hashicorp/aws/latest/docs/resources/iam_role_policy_attachment) | resource |
| [aws_iam_role_policy_attachment.firehose-role-policy-attachment](https://registry.terraform.io/providers/hashicorp/aws/latest/docs/resources/iam_role_policy_attachment) | resource |
| [aws_kinesis_firehose_delivery_stream.cwl-s3-firehose-stream](https://registry.terraform.io/providers/hashicorp/aws/latest/docs/resources/kinesis_firehose_delivery_stream) | resource |
| [random_id.rid](https://registry.terraform.io/providers/hashicorp/random/latest/docs/resources/id) | resource |
| [aws_caller_identity.this](https://registry.terraform.io/providers/hashicorp/aws/latest/docs/data-sources/caller_identity) | data source |
## Inputs
| Name | Description | Type | Default | Required |
|------|-------------|------|---------|:--------:|
| cwl-region | AWS region where Cloudwatch LogGroup resides. Needed for setting up cwlog-stream-role | `string` | n/a | yes |
| dest-bucket-arn | Destination S3 bucket ARN | `string` | n/a | yes |
| dest-bucket-kmskey-arn | KMS key ARN for destination bucket | `string` | n/a | yes |
| dest-bucket-prefix | S3 object prefix for this stream. Do not start it with a /, but do end it with a /. For example, r53-log/acme.local/ | `string` | n/a | yes |
| enable-firehose-errorlog | Enable firehose errorlog | `bool` | `false` | no |
| firehose-kmskey-arn | KMS Key arn for Firehose | `string` | n/a | yes |
| source-cwlgroup-name | Name of source CloudwatchLog group | `string` | n/a | yes |
| stream-name | Name of Kinesis Data Firehose delivery stream | `string` | n/a | yes |
## Outputs
| Name | Description |
|------|-------------|
| cloudwatchstream-iam-role-arn | n/a |
| firehose-iam-role-arn | n/a |
---
## Authorship
This module was developed by Rackspace.
@@ -0,0 +1,162 @@
# Firehose delivery stream that receives CloudWatch Logs records and writes
# them, GZIP-compressed, to the destination S3 bucket.
resource "aws_kinesis_firehose_delivery_stream" "cwl-s3-firehose-stream" {
  name        = var.stream-name
  destination = "extended_s3"

  # Encrypt the stream with the caller-supplied KMS key.
  server_side_encryption {
    enabled  = true
    key_type = "CUSTOMER_MANAGED_CMK"
    key_arn  = var.firehose-kmskey-arn
  }

  extended_s3_configuration {
    role_arn           = aws_iam_role.firehose-stream-iam-role.arn
    bucket_arn         = var.dest-bucket-arn
    kms_key_arn        = var.dest-bucket-kmskey-arn
    compression_format = "GZIP"

    # A leading "/" on the caller's prefix is stripped.
    prefix              = trimprefix(var.dest-bucket-prefix, "/")
    error_output_prefix = "FirehoseErrors/"

    # The error log group only exists when enable-firehose-errorlog is true;
    # try() falls back to null when the counted resource has no instance.
    cloudwatch_logging_options {
      enabled         = var.enable-firehose-errorlog
      log_group_name  = try(aws_cloudwatch_log_group.firehose-log[0].name, null)
      log_stream_name = "DestinationDelivery"
    }
  }
}
# Optional log group for Firehose delivery errors; created only when
# var.enable-firehose-errorlog is true.
resource "aws_cloudwatch_log_group" "firehose-log" {
  count = var.enable-firehose-errorlog ? 1 : 0

  name              = "/aws/kinesisfirehose/${var.stream-name}"
  retention_in_days = 365
}
# Subscribes the source log group to the Firehose delivery stream. The empty
# filter pattern forwards every log event.
resource "aws_cloudwatch_log_subscription_filter" "cwl-sub-filter" {
  name            = "stream-to-s3"
  log_group_name  = var.source-cwlgroup-name
  filter_pattern  = ""
  role_arn        = aws_iam_role.cwlog-stream-role.arn
  destination_arn = aws_kinesis_firehose_delivery_stream.cwl-s3-firehose-stream.arn

  # role_arn only creates a dependency on the role itself, not on the policy
  # attached to it. Without this explicit dependency Terraform may create the
  # filter before the role is allowed to put records into the stream, which
  # can make the filter creation fail on a fresh apply.
  depends_on = [
    aws_iam_role_policy_attachment.cwlog-role-policy-attachment,
  ]
}
# Random suffix appended (in decimal form) to the IAM role and policy names
# created by this module.
resource "random_id" "rid" {
  byte_length = 4
}
# IAM role assumed by the Firehose service when delivering records.
resource "aws_iam_role" "firehose-stream-iam-role" {
  name        = "firehose-stream-role-${var.stream-name}-${random_id.rid.dec}"
  description = "Kinesis Firehose IAM role for streaming logs from CloudwatchLog to S3"

  # Trust policy: only the Firehose service principal may assume the role.
  assume_role_policy = jsonencode({
    Version = "2012-10-17"
    Statement = [
      {
        Sid       = "FirehoseStreaming"
        Effect    = "Allow"
        Action    = "sts:AssumeRole"
        Principal = { Service = "firehose.amazonaws.com" }
      },
    ]
  })
}
# Binds the Firehose delivery policy to the Firehose role.
resource "aws_iam_role_policy_attachment" "firehose-role-policy-attachment" {
  policy_arn = aws_iam_policy.firehose-role-policy.arn
  role       = aws_iam_role.firehose-stream-iam-role.name
}
# Permissions for the Firehose role: write to the destination bucket, use the
# bucket's KMS key, and write delivery errors to the Firehose error log group.
resource "aws_iam_policy" "firehose-role-policy" {
  name        = "kinesis-firehose-log-stream-${var.stream-name}-${random_id.rid.dec}"
  description = "Policy for Kinesis Firehose streaming logs to s3"
  policy = jsonencode(
    {
      "Version" : "2012-10-17",
      "Statement" : [
        {
          # Object delivery to the destination bucket.
          "Effect" : "Allow",
          "Action" : [
            "s3:AbortMultipartUpload",
            "s3:GetBucketLocation",
            "s3:GetObject",
            "s3:ListBucket",
            "s3:ListBucketMultipartUploads",
            "s3:PutObject"
          ],
          "Resource" : [
            var.dest-bucket-arn,
            "${var.dest-bucket-arn}/*"
          ]
        },
        {
          # Encryption of delivered objects with the bucket's KMS key.
          "Effect" : "Allow",
          "Action" : [
            "kms:Decrypt",
            "kms:GenerateDataKey"
          ],
          "Resource" : [
            var.dest-bucket-kmskey-arn
          ]
        },
        {
          # Error logging. Log stream ARNs have the form
          # "...:log-group:<group>:log-stream:<stream>", so the pattern must
          # continue with ":" after the group name. The previous "/*" suffix
          # could never match a stream of the group
          # "/aws/kinesisfirehose/<stream-name>" created by this module.
          "Effect" : "Allow",
          "Action" : [
            "logs:PutLogEvents",
            "logs:PutLogEventsBatch",
            "logs:CreateLogStream"
          ],
          "Resource" : [
            "arn:aws:logs:*:*:log-group:/aws/kinesisfirehose/${var.stream-name}:*"
          ]
        }
      ]
    }
  )
}
# IAM role assumed by CloudWatch Logs when forwarding events to Firehose.
resource "aws_iam_role" "cwlog-stream-role" {
  name        = "cloudwatchlog-stream-role-${var.stream-name}-${random_id.rid.dec}"
  description = "CloudwatchLog role for streaming to firehose"

  # Trust policy: the regional CloudWatch Logs service principal for
  # var.cwl-region may assume the role.
  assume_role_policy = jsonencode({
    Version = "2012-10-17"
    Statement = [
      {
        Sid       = "CloudwatchLogStreaming"
        Effect    = "Allow"
        Action    = "sts:AssumeRole"
        Principal = { Service = "logs.${var.cwl-region}.amazonaws.com" }
      },
    ]
  })
}
# Binds the PutRecord policy to the CloudWatch Logs streaming role.
resource "aws_iam_role_policy_attachment" "cwlog-role-policy-attachment" {
  policy_arn = aws_iam_policy.cwlog-role-policy.arn
  role       = aws_iam_role.cwlog-stream-role.name
}
# Permissions for the CloudWatch Logs role: put records into this module's
# delivery stream only.
resource "aws_iam_policy" "cwlog-role-policy" {
  name        = "cloudwatchlog-stream-${var.stream-name}-${random_id.rid.dec}"
  description = "Policy for CloudWatch Logs streaming to Kinesis Firehose"

  policy = jsonencode({
    Version = "2012-10-17"
    Statement = [
      {
        Effect = "Allow"
        Action = ["firehose:PutRecord"]
        # Stream ARN assembled from the region input, the current account id
        # and the stream name.
        Resource = [
          "arn:aws:firehose:${var.cwl-region}:${data.aws_caller_identity.this.account_id}:deliverystream/${var.stream-name}",
        ]
      },
    ]
  })
}
# Identity of the calling credentials; supplies the account id used when
# building the delivery stream ARN.
data "aws_caller_identity" "this" {
}
@@ -0,0 +1,7 @@
# ARN of the role created for the Firehose delivery stream.
output "firehose-iam-role-arn" {
  description = "ARN of the IAM role assumed by Kinesis Data Firehose for this delivery stream"
  value       = aws_iam_role.firehose-stream-iam-role.arn
}
# ARN of the role created for the CloudWatch Logs subscription filter.
output "cloudwatchstream-iam-role-arn" {
  description = "ARN of the IAM role assumed by CloudWatch Logs to put records into the delivery stream"
  value       = aws_iam_role.cwlog-stream-role.arn
}
@@ -0,0 +1,40 @@
# Inputs of the CloudWatch Logs -> Firehose -> S3 streaming module.

variable "stream-name" {
  type        = string
  description = "Name of Kinesis Data Firehose delivery stream"
}

variable "firehose-kmskey-arn" {
  type        = string
  description = "KMS Key arn for Firehose"
}

variable "dest-bucket-arn" {
  type        = string
  description = "Destination S3 bucket ARN"
}

# The previous wording ("do not start with / end with a /") could be read as
# forbidding a trailing slash, which contradicts its own example.
variable "dest-bucket-prefix" {
  type        = string
  description = "S3 object prefix for this stream. Do not start it with a /, but do end it with a /. For example, r53-log/acme.local/"
}

variable "dest-bucket-kmskey-arn" {
  type        = string
  description = "KMS key ARN for destination bucket"
}

variable "source-cwlgroup-name" {
  type        = string
  description = "Name of source CloudwatchLog group"
}

variable "cwl-region" {
  type        = string
  description = "AWS region where Cloudwatch LogGroup resides. Needed for setting up cwlog-stream-role"
}

variable "enable-firehose-errorlog" {
  type        = bool
  description = "Enable firehose errorlog"
  default     = false
}
@@ -0,0 +1,9 @@
# Version constraints for the log-streaming module.
terraform {
  required_version = "~> 1.3.0"

  required_providers {
    aws = {
      source  = "hashicorp/aws"
      version = ">= 5.0"
    }

    # The module creates a random_id resource, so the random provider is
    # declared explicitly instead of relying on implicit resolution. No
    # version constraint is added, to avoid changing provider selection for
    # existing users.
    random = {
      source = "hashicorp/random"
    }
  }
}
@@ -0,0 +1,22 @@
# Monitoring module
This module deploys the default CloudWatch metric alarms for an Application Load Balancer and its target groups.
## Notes
The Terraform lifecycle configuration ignores tags to speed up subsequent Terraform updates. CloudWatch alarm tags cannot be viewed in the AWS console anyway.
## Example
```terraform
module "alb-arns" {
source = "../../modules/util/resource-list"
resource-type = "alb"
}
module "alb-monitoring" {
for_each = toset(split(" ", data.external.alb-arns.result.result))
source = "../../modules/ManagementGovernance/Monitoring.ALB"
default-tags = local.default-tags
load-balancer = each.value
threshold-HealthHostCountMin = 1
}
```
@@ -0,0 +1,6 @@
#!/bin/bash
# Helper for a Terraform "external" data source.
# Input  (stdin):  JSON object with key "lb" - a load balancer ARN.
# Output (stdout): JSON object {"result": "<space-separated, sorted target group ARNs>"}.

# @sh shell-quotes the value, so the eval cannot be abused by odd characters.
eval "$(jq -r '@sh "lb=\(.lb)"')"

# "$lb" and the JMESPath query are quoted so the shell neither word-splits the
# ARN nor treats "[*]" as a glob pattern. tr replaces the GNU-sed-only "\t"/"\n"
# escapes with a portable equivalent.
RESULTS=$(aws elbv2 describe-target-groups \
  --load-balancer-arn "$lb" \
  --query 'TargetGroups[*].TargetGroupArn' \
  --output text --no-cli-pager | tr '\t' '\n' | sort | xargs)

jq -n --arg result "$RESULTS" '{"result":$result}'
@@ -0,0 +1,110 @@
locals {
  # CloudWatch "LoadBalancer" dimension value, built from the third and fourth
  # "/"-separated segments of the load balancer ARN: "app/<segment 2>/<segment 3>".
  alb-name = format(
    "app/%s/%s",
    split("/", var.load-balancer)[2],
    split("/", var.load-balancer)[3],
  )
}
# Alarm on the HTTPCode_ELB_5XX_Count metric of the load balancer.
resource "aws_cloudwatch_metric_alarm" "alb-HTTPCode_ELB_5XX_Count" {
  # Identity
  alarm_name        = "${var.settings.HTTPCode_ELB_5XX_Count.ecccode}-ALB_${local.alb-name}-HTTPCode_ELB_5XX_Count"
  alarm_description = "ALB:HTTPCode_ELB_5XX_Count"

  # Metric selection
  namespace   = "AWS/ApplicationELB"
  metric_name = "HTTPCode_ELB_5XX_Count"
  dimensions = {
    LoadBalancer = local.alb-name
  }

  # Evaluation, taken from var.settings.HTTPCode_ELB_5XX_Count
  comparison_operator = var.settings.HTTPCode_ELB_5XX_Count.comparison_operator
  evaluation_periods  = var.settings.HTTPCode_ELB_5XX_Count.evaluation_periods
  period              = var.settings.HTTPCode_ELB_5XX_Count.period
  statistic           = var.settings.HTTPCode_ELB_5XX_Count.statistic
  threshold           = var.settings.HTTPCode_ELB_5XX_Count.threshold

  # Notifications: the same action fires on ALARM and on return to OK.
  actions_enabled           = var.actions-enabled
  alarm_actions             = [var.settings.HTTPCode_ELB_5XX_Count.action]
  ok_actions                = [var.settings.HTTPCode_ELB_5XX_Count.action]
  insufficient_data_actions = []
}
# Alarm on the TargetConnectionErrorCount metric of the load balancer.
resource "aws_cloudwatch_metric_alarm" "alb-TargetConnectionErrorCount" {
  # Identity
  alarm_name        = "${var.settings.TargetConnectionErrorCount.ecccode}-ALB_${local.alb-name}-TargetConnectionErrorCount"
  alarm_description = "ALB:TargetConnectionErrorCount"

  # Metric selection
  namespace   = "AWS/ApplicationELB"
  metric_name = "TargetConnectionErrorCount"
  dimensions = {
    LoadBalancer = local.alb-name
  }

  # Evaluation, taken from var.settings.TargetConnectionErrorCount
  comparison_operator = var.settings.TargetConnectionErrorCount.comparison_operator
  evaluation_periods  = var.settings.TargetConnectionErrorCount.evaluation_periods
  period              = var.settings.TargetConnectionErrorCount.period
  statistic           = var.settings.TargetConnectionErrorCount.statistic
  threshold           = var.settings.TargetConnectionErrorCount.threshold

  # Notifications: the same action fires on ALARM and on return to OK.
  actions_enabled           = var.actions-enabled
  alarm_actions             = [var.settings.TargetConnectionErrorCount.action]
  ok_actions                = [var.settings.TargetConnectionErrorCount.action]
  insufficient_data_actions = []
}
# Alarm on the TargetResponseTime metric of the load balancer.
resource "aws_cloudwatch_metric_alarm" "alb-TargetResponseTime" {
  # Identity
  alarm_name        = "${var.settings.TargetResponseTime.ecccode}-ALB_${local.alb-name}-TargetResponseTime"
  alarm_description = "ALB:TargetResponseTime"

  # Metric selection
  namespace   = "AWS/ApplicationELB"
  metric_name = "TargetResponseTime"
  dimensions = {
    LoadBalancer = local.alb-name
  }

  # Evaluation, taken from var.settings.TargetResponseTime
  comparison_operator = var.settings.TargetResponseTime.comparison_operator
  evaluation_periods  = var.settings.TargetResponseTime.evaluation_periods
  period              = var.settings.TargetResponseTime.period
  statistic           = var.settings.TargetResponseTime.statistic
  threshold           = var.settings.TargetResponseTime.threshold

  # Notifications: the same action fires on ALARM and on return to OK.
  actions_enabled           = var.actions-enabled
  alarm_actions             = [var.settings.TargetResponseTime.action]
  ok_actions                = [var.settings.TargetResponseTime.action]
  insufficient_data_actions = []
}
/*
module "alb-targetgroups" {
source = "../../util/resource-list"
resource-type = "alb-targetgroups"
query-input = var.load-balancer
asrolearn = var.asrolearn
}
*/
// causes Rate exceeded error, maybe because of adaptive AWS_RETRY_MODE?
/*
module "alb_tgs" {
assume_role_arn = var.asrolearn
role_session_name = "terraform-resource-list"
source = "../../util/terraform-aws-cli"
aws_cli_commands = ["elbv2", "describe-target-groups", "--load-balancer-arn", var.load-balancer]
aws_cli_query = "TargetGroups[*].TargetGroupArn"
}
*/
# Lists the target group ARNs of the load balancer through the shared awscli
# utility module, using the target account's credentials.
module "alb_tgs" {
  source = "../../util/awscli"

  # Credentials for the target account
  access_key    = var.target-account-ak
  secret_key    = var.target-account-sk
  session_token = var.target-account-token

  aws_cli_commands = "elbv2 describe-target-groups --load-balancer-arn ${var.load-balancer} --query TargetGroups[*].TargetGroupArn"
}
# One HealthyHostCount alarm per target group returned by module.alb_tgs.
# each.value is a target group ARN; its sixth ":"-separated field
# ("targetgroup/<name>/<id>") is the CloudWatch TargetGroup dimension value.
resource "aws_cloudwatch_metric_alarm" "alb-HealthyHostCount" {
  #  for_each                  = module.alb-targetgroups.result-set
  for_each                  = toset(module.alb_tgs.awscliout)
  alarm_name                = "${var.settings.HealthHostCountMin.ecccode}-ALBTG_:${split(":", each.value)[5]}-HealthyHostCount"
  comparison_operator       = var.settings.HealthHostCountMin.comparison_operator
  evaluation_periods        = var.settings.HealthHostCountMin.evaluation_periods
  metric_name               = "HealthyHostCount"
  period                    = var.settings.HealthHostCountMin.period
  statistic                 = var.settings.HealthHostCountMin.statistic
  threshold                 = var.settings.HealthHostCountMin.threshold
  alarm_description         = "ALBTG:HealthyHostCount"
  namespace                 = "AWS/ApplicationELB"
  insufficient_data_actions = []
  actions_enabled           = var.actions-enabled
  alarm_actions             = [var.settings.HealthHostCountMin.action]
  ok_actions                = [var.settings.HealthHostCountMin.action]
  dimensions = {
    TargetGroup = split(":", each.value)[5]
    # Same value as before, but taken from the shared local so this alarm
    # cannot drift from the other alarms in the module.
    LoadBalancer = local.alb-name
  }
}
@@ -0,0 +1,4 @@
# Number of target group ARNs returned by module.alb_tgs.
output "alb-tg-count" {
  description = "Number of target groups found for the load balancer"
  #  value = length(module.alb-targetgroups.result-set)
  value = length(flatten(module.alb_tgs.awscliout))
}
@@ -0,0 +1,9 @@
# Version constraints for the ALB monitoring module.
terraform {
  # Terraform CLI version constraint.
  required_version = "~> 1.3.0"

  required_providers {
    # AWS provider, resolved from the public registry.
    aws = {
      version = ">= 4.36.1"
      source  = "hashicorp/aws"
    }
  }
}
@@ -0,0 +1,8 @@
# Inputs of the ALB monitoring module. No type constraints are added so that
# existing callers keep working unchanged.

# NOTE(review): not referenced by the module code visible here - confirm
# whether it is still needed.
variable "cw-alarm-prefix" {
  description = "Prefix for CloudWatch alarm names"
}

variable "actions-enabled" {
  description = "Value passed to actions_enabled on every alarm in this module"
}

variable "load-balancer" {
  description = "ARN of the Application Load Balancer to monitor"
}

variable "settings" {
  description = "Per-alarm settings keyed by HTTPCode_ELB_5XX_Count, TargetConnectionErrorCount, TargetResponseTime and HealthHostCountMin. Each entry provides ecccode, comparison_operator, evaluation_periods, period, statistic, threshold and action"
}

# variable asrolearn {}

variable "target-account-ak" {
  description = "Access key passed to the awscli utility module for the target account"
}

variable "target-account-sk" {
  description = "Secret key passed to the awscli utility module for the target account"
}

variable "target-account-token" {
  description = "Session token passed to the awscli utility module for the target account"
}
@@ -0,0 +1,24 @@
# Monitoring module
This module deploys the default CloudWatch metric alarms for an Auto Scaling group.
## Notes
The Terraform lifecycle configuration ignores tags to speed up subsequent Terraform updates. CloudWatch alarm tags cannot be viewed in the AWS console anyway.
## Example
```terraform
module "asg" {
source = "../../modules/util/resource-list"
resource-type = "asg"
}
module "asg-monitoring" {
cw-alarm-prefix = local.cw-alarm-prefix
for_each = module.asg.result-set
source = "../../modules/ManagementGovernance/Monitoring.ASG"
default-tags = local.default-tags
asg-name = each.value
threshold-CPUUtilization = 90
actions-enabled = var.actions-enabled
sns-targets = var.sns-targets
}
```
@@ -0,0 +1,41 @@
# Looks up the Auto Scaling group by name; its min_size is used as the
# threshold of the GroupInServiceCapacity alarm.
data "aws_autoscaling_group" "asg" {
  name = var.asg-name
}
# Alarm on the CPUUtilization metric for the Auto Scaling group.
resource "aws_cloudwatch_metric_alarm" "asg-CPUUtilization" {
  # Identity
  alarm_name        = "${var.settings.CPUUtilization.ecccode}-ASG_${var.asg-name}-CPUUtilization"
  alarm_description = "ASG:CPUUtilization"

  # Metric selection
  namespace   = "AWS/EC2"
  metric_name = "CPUUtilization"
  dimensions = {
    AutoScalingGroupName = var.asg-name
  }

  # Evaluation, taken from var.settings.CPUUtilization
  comparison_operator = var.settings.CPUUtilization.comparison_operator
  evaluation_periods  = var.settings.CPUUtilization.evaluation_periods
  period              = var.settings.CPUUtilization.period
  statistic           = var.settings.CPUUtilization.statistic
  threshold           = var.settings.CPUUtilization.threshold

  # Notifications: the same action fires on ALARM and on return to OK.
  actions_enabled           = var.actions-enabled
  alarm_actions             = [var.settings.CPUUtilization.action]
  ok_actions                = [var.settings.CPUUtilization.action]
  insufficient_data_actions = []
}
# Alarm when the minimum GroupInServiceCapacity drops below the group's
# configured min_size.
resource "aws_cloudwatch_metric_alarm" "asg-GroupInServiceCapacity" {
  # Identity
  alarm_name        = "${var.settings.GroupInServiceCapacity.ecccode}-ASG_${var.asg-name}-GroupInServiceCapacity"
  alarm_description = "ASG:GroupInServiceCapacity"

  # Metric selection
  namespace   = "AWS/AutoScaling"
  metric_name = "GroupInServiceCapacity"
  dimensions = {
    AutoScalingGroupName = var.asg-name
  }

  # Evaluation: operator and statistic are fixed here; the threshold comes
  # from the Auto Scaling group itself rather than from var.settings.
  comparison_operator = "LessThanThreshold"
  statistic           = "Minimum"
  threshold           = data.aws_autoscaling_group.asg.min_size
  evaluation_periods  = var.settings.GroupInServiceCapacity.evaluation_periods
  period              = var.settings.GroupInServiceCapacity.period

  # Notifications: the same action fires on ALARM and on return to OK.
  actions_enabled           = var.actions-enabled
  alarm_actions             = [var.settings.GroupInServiceCapacity.action]
  ok_actions                = [var.settings.GroupInServiceCapacity.action]
  insufficient_data_actions = []
}
@@ -0,0 +1,9 @@
# Version constraints for the ASG monitoring module.
terraform {
  # Terraform CLI version constraint.
  required_version = "~> 1.3.0"

  required_providers {
    # AWS provider, resolved from the public registry.
    aws = {
      version = ">= 4.36.1"
      source  = "hashicorp/aws"
    }
  }
}
@@ -0,0 +1,5 @@
# Inputs of the ASG monitoring module. No type constraints or defaults are
# added so that existing callers keep working unchanged.

# NOTE(review): not referenced by the module code visible here - confirm
# whether it is still needed.
variable "cw-alarm-prefix" {
  description = "Prefix for CloudWatch alarm names"
}

variable "actions-enabled" {
  description = "Value passed to actions_enabled on every alarm in this module"
}

variable "asg-name" {
  description = "Name of the Auto Scaling group to monitor"
}

variable "settings" {
  description = "Per-alarm settings keyed by CPUUtilization and GroupInServiceCapacity. Each entry provides ecccode, evaluation_periods, period and action; CPUUtilization also provides comparison_operator, statistic and threshold"
}

# NOTE(review): the alarms visible here read the code from
# var.settings.<alarm>.ecccode rather than from this variable - confirm
# whether it is still needed.
variable "ecccode" {
  description = "Alarm code"
}
@@ -0,0 +1,74 @@
# Monitoring module
This module deploys the default CloudWatch metric alarms for an EC2 instance.
## Notes
The Terraform lifecycle configuration ignores tags to speed up subsequent Terraform updates. CloudWatch alarm tags cannot be viewed in the AWS console anyway.
## Example
```terraform
module "ec2-instances" {
source = "../../modules/util/resource-list"
resource-type = "ec2"
}
module "ec2-monitoring" {
cw-alarm-prefix = local.cw-alarm-prefix
for_each = module.ec2-instances.result-set
source = "../../modules/ManagementGovernance/Monitoring.EC2"
default-tags = local.default-tags
ec2-instance-id = each.value
threshold-CPUUtilization = 90
threshold-mem_free = 100000
threshold-swap_free = 100000
threshold-disk_free = 1 * 1000 * 1000 * 1000
threshold-disk_inodes_free = 10000
threshold-processes_total = 500
threshold-LogicalDiskFreePct = 10
threshold-MemoryCommittedPct = 90
actions-enabled = var.actions-enabled
sns-targets = var.sns-targets
}
```
## Sample cloudwatch alarm email notification
```
Subject: ALARM: "TestAlarmPleaseIgnore" in Asia Pacific (Hong Kong)
You are receiving this email because your Amazon CloudWatch Alarm "TestAlarmPleaseIgnore" in the
Asia Pacific (Hong Kong) region has entered the ALARM state, because "Threshold Crossed: 1 out of
the last 1 datapoints [864.0 (24/01/24 00:56:00)] was less than or equal to the threshold (900.0)
(minimum 1 datapoint for OK -> ALARM transition)." at "Wednesday 24 January, 2024 01:01:34 UTC".
View this alarm in the AWS Management Console:
https://ap-east-1.console.aws.amazon.com%2Fcloudwatch...
Alarm Details:
- Name: TestAlarmPleaseIgnore
- Description: Cloudwatch alarm for the following resource
- Instance ID: xxx
- Instance Name: yyy
- Instance IP: zz.zz.zz.zz
- State Change: OK -> ALARM
- Reason for State Change: Threshold Crossed: 1 out of the last 1 datapoints [864.0 (24/01/24 00:56:00)] was less than or equal to the threshold (900.0) (minimum 1 datapoint for OK -> ALARM transition).
- Timestamp: Wednesday 24 January, 2024 01:01:34 UTC
- AWS Account: 111122223333
- Alarm Arn: arn:aws:cloudwatch:ap-east-1:111122223333:alarm:TestAlarmPleaseIgnore
Threshold:
- The alarm is in the ALARM state when the metric is LessThanOrEqualToThreshold 900.0 for at least 1 of the last 1 period(s) of 300 seconds.
Monitored Metric:
- MetricNamespace: AWS/EC2
- MetricName: CPUCreditBalance
- Dimensions: [InstanceId = i-050d4adeafaa53cd0]
- Period: 300 seconds
- Statistic: Average
- Unit: not specified
- TreatMissingData: missing
State Change Actions:
- OK:
- ALARM: [arn:aws:sns:ap-east-1:111122223333:CWA-SNS-Email-KenFong]
- INSUFFICIENT_DATA:
```
@@ -0,0 +1,22 @@
#!/bin/bash
# Helper for a Terraform "external" data source.
# Input  (stdin):  JSON object with keys "input" (EC2 instance id) and
#                  "asrolearn" (ARN of the role to assume).
# Output (stdout): JSON object with the "device" and "fstype" dimensions of the
#                  CWAgent disk_inodes_free metric for path "/", or
#                  {"device":"unknown","fstype":"unknown"} when none is found.

# @sh shell-quotes every value, so the eval cannot be abused by odd characters.
eval "$(jq -r '@sh "export id=\(.input) asrolearn=\(.asrolearn)"')"

# Assume the role and export the temporary credentials. The eval argument is
# quoted and the values are shell-quoted by @sh; previously the unquoted
# command substitution was word-split before being evaluated.
eval "$(aws sts assume-role --role-arn "$asrolearn" --role-session-name awscli |
  jq -r '.Credentials | @sh "export AWS_ACCESS_KEY_ID=\(.AccessKeyId // "") AWS_SECRET_ACCESS_KEY=\(.SecretAccessKey // "") AWS_SESSION_TOKEN=\(.SessionToken // "") AWS_SESSION_EXPIRATION=\(.Expiration // "")"')"

aws cloudwatch list-metrics --namespace CWAgent --metric-name disk_inodes_free \
  --dimensions "Name=InstanceId,Value=$id" Name=path,Value=/ |
  jq '.Metrics[] | .Dimensions[] | select ((.Name=="device") or (.Name=="fstype")) | { (.Name): (.Value)}' |
  jq -s 'add // {"device":"unknown", "fstype":"unknown"}'

exit 0

# The code below was unreachable after the "exit 0" above. It is kept,
# commented out, as a reference for the earlier --query based approach.
#
# DEVICE=$(aws cloudwatch list-metrics --namespace CWAgent --metric-name disk_inodes_free \
#   --dimensions Name=InstanceId,Value=$id Name=path,Value=/ \
#   --query 'Metrics[].Dimensions[?Name==`device`].Value' --output text)
#
# FSTYPE=$(aws cloudwatch list-metrics --namespace CWAgent --metric-name disk_inodes_free \
#   --dimensions Name=InstanceId,Value=$id Name=path,Value=/ \
#   --query 'Metrics[].Dimensions[?Name==`fstype`].Value' --output text)
#
# jq -n --arg device "$DEVICE" --arg fstype "$FSTYPE" '{"device":$device,"fstype":$fstype}'
@@ -0,0 +1,25 @@
#!/bin/bash
# Helper for a Terraform "external" data source.
# Input  (stdin):  JSON object with keys access_key, secret_key, session_token
#                  and iid (EC2 instance id).
# Output (stdout): JSON object holding every dimension (name -> value) of the
#                  last CWAgent disk_inodes_free metric reported for path "/".
#                  When no metric is found the first jq emits nothing and the
#                  final jq prints "null"; the calling Terraform code tests the
#                  result against null, so that behavior is kept as is.

# Get the query
TERRAFORM_QUERY=$(jq -Mc .)

# Extract the query attributes
access_key=$(echo "${TERRAFORM_QUERY}" | jq -r '.access_key')
secret_key=$(echo "${TERRAFORM_QUERY}" | jq -r '.secret_key')
session_token=$(echo "${TERRAFORM_QUERY}" | jq -r '.session_token')
iid=$(echo "${TERRAFORM_QUERY}" | jq -r '.iid')

# eval "$(jq -r '@sh "export id=\(.input) asrolearn=\(.asrolearn)"')"
# eval $(aws sts assume-role --role-arn $asrolearn --role-session-name awscli | jq -cr '"export AWS_ACCESS_KEY_ID=" + .Credentials.AccessKeyId, "export AWS_SECRET_ACCESS_KEY=" + .Credentials.SecretAccessKey, "export AWS_SESSION_TOKEN=" + .Credentials.SessionToken, "export AWS_SESSION_EXPIRATION=" + .Credentials.Expiration')

export AWS_ACCESS_KEY_ID="$access_key"
export AWS_SECRET_ACCESS_KEY="$secret_key"
export AWS_SESSION_TOKEN="$session_token"

#aws cloudwatch list-metrics --namespace CWAgent --metric-name disk_inodes_free \
#--dimensions Name=InstanceId,Value=$iid Name=path,Value=/ | \
#jq '.Metrics[] | .Dimensions[] | {(.Name):(.Value)}' | jq -s 'add'

# when there are multiple metrics with the same name...
# The instance id and the JMESPath query are quoted so the shell neither
# word-splits the value nor treats "[]" as a glob pattern.
aws cloudwatch list-metrics --namespace CWAgent --metric-name disk_inodes_free \
  --dimensions "Name=InstanceId,Value=$iid" Name=path,Value=/ --query 'Metrics[]' |
  jq '. | last | .Dimensions[] | {(.Name):(.Value)}' | jq -s 'add'
@@ -0,0 +1,12 @@
#!/bin/bash
# Helper for a Terraform "external" data source.
# Input  (stdin):  JSON object with keys "input" (EC2 instance id) and
#                  "asrolearn" (ARN of the role to assume).
# Output (stdout): {"os": "Windows"} or {"os": "Linux"}.

# @sh shell-quotes every value, so the eval cannot be abused by odd characters.
eval "$(jq -r '@sh "export id=\(.input) asrolearn=\(.asrolearn)"')"

# Assume the role and export the temporary credentials (values shell-quoted).
eval "$(aws sts assume-role --role-arn "$asrolearn" --role-session-name awscli |
  jq -r '.Credentials | @sh "export AWS_ACCESS_KEY_ID=\(.AccessKeyId // "") AWS_SECRET_ACCESS_KEY=\(.SecretAccessKey // "") AWS_SESSION_TOKEN=\(.SessionToken // "") AWS_SESSION_EXPIRATION=\(.Expiration // "")"')"

EC2OS=$(aws ec2 describe-instances --instance-ids "$id" | jq -r '.Reservations[].Instances[].PlatformDetails')

# PlatformDetails can contain several words (for example
# "Windows with SQL Server Standard"). The previous unquoted
# [ $EC2OS == "Windows" ] test failed with "too many arguments" for such values
# and reported them as Linux; an empty value also raised a test error.
# A quoted prefix match handles all of these cases.
if [[ "$EC2OS" == Windows* ]]; then
  echo '{"os": "Windows"}'
else
  echo '{"os": "Linux"}'
fi
@@ -0,0 +1,395 @@
locals {
  # Description attached to every EC2 alarm in this module.
  # alarm-message limited to 1024 characters
  #
  # lookup() with a default replaces tags["Name"], which failed the whole plan
  # for any instance that has no Name tag; such instances now get an empty
  # name in the message instead.
  alarm-message = <<EOF
Cloudwatch alarm for the following resource
- Instance ID: ${var.ec2-instance-id}
- Instance Name: ${lookup(data.aws_instance.ec2-instance.tags, "Name", "")}
- Instance IP: ${data.aws_instance.ec2-instance.private_ip}
- Instance Type: ${data.aws_instance.ec2-instance.instance_type}
EOF
}
# Alarm on the StatusCheckFailed_System metric of the instance.
resource "aws_cloudwatch_metric_alarm" "ec2-StatusCheckFailed_System" {
  # Identity; the description carries the instance details from local.alarm-message.
  alarm_name        = "${var.settings.StatusCheckFailed_System.ecccode}-EC2_${var.ec2-instance-id}-StatusCheckFailed_System"
  alarm_description = local.alarm-message

  # Metric selection
  namespace   = "AWS/EC2"
  metric_name = "StatusCheckFailed_System"
  dimensions = {
    InstanceId = var.ec2-instance-id
  }

  # Evaluation, taken from var.settings.StatusCheckFailed_System
  comparison_operator = var.settings.StatusCheckFailed_System.comparison_operator
  evaluation_periods  = var.settings.StatusCheckFailed_System.evaluation_periods
  period              = var.settings.StatusCheckFailed_System.period
  statistic           = var.settings.StatusCheckFailed_System.statistic
  threshold           = var.settings.StatusCheckFailed_System.threshold

  # Notifications: the same action fires on ALARM and on return to OK.
  actions_enabled           = var.actions-enabled
  alarm_actions             = [var.settings.StatusCheckFailed_System.action]
  ok_actions                = [var.settings.StatusCheckFailed_System.action]
  insufficient_data_actions = []
}
# Alarm on the StatusCheckFailed_Instance metric of the instance.
resource "aws_cloudwatch_metric_alarm" "ec2-StatusCheckFailed_Instance" {
  # Identity; the description carries the instance details from local.alarm-message.
  alarm_name        = "${var.settings.StatusCheckFailed_Instance.ecccode}-EC2_${var.ec2-instance-id}-StatusCheckFailed_Instance"
  alarm_description = local.alarm-message

  # Metric selection
  namespace   = "AWS/EC2"
  metric_name = "StatusCheckFailed_Instance"
  dimensions = {
    InstanceId = var.ec2-instance-id
  }

  # Evaluation, taken from var.settings.StatusCheckFailed_Instance
  comparison_operator = var.settings.StatusCheckFailed_Instance.comparison_operator
  evaluation_periods  = var.settings.StatusCheckFailed_Instance.evaluation_periods
  period              = var.settings.StatusCheckFailed_Instance.period
  statistic           = var.settings.StatusCheckFailed_Instance.statistic
  threshold           = var.settings.StatusCheckFailed_Instance.threshold

  # Notifications: the same action fires on ALARM and on return to OK.
  actions_enabled           = var.actions-enabled
  alarm_actions             = [var.settings.StatusCheckFailed_Instance.action]
  ok_actions                = [var.settings.StatusCheckFailed_Instance.action]
  insufficient_data_actions = []
}
# Alarm on the CPUUtilization metric of the instance.
resource "aws_cloudwatch_metric_alarm" "ec2-CPUUtilization" {
  # Identity; the description carries the instance details from local.alarm-message.
  alarm_name        = "${var.settings.CPUUtilization.ecccode}-EC2_${var.ec2-instance-id}-CPUUtilization"
  alarm_description = local.alarm-message

  # Metric selection
  namespace   = "AWS/EC2"
  metric_name = "CPUUtilization"
  dimensions = {
    InstanceId = var.ec2-instance-id
  }

  # Evaluation, taken from var.settings.CPUUtilization. Missing datapoints
  # are treated as not breaching.
  comparison_operator = var.settings.CPUUtilization.comparison_operator
  evaluation_periods  = var.settings.CPUUtilization.evaluation_periods
  period              = var.settings.CPUUtilization.period
  statistic           = var.settings.CPUUtilization.statistic
  threshold           = var.settings.CPUUtilization.threshold
  treat_missing_data  = "notBreaching"

  # Notifications: the same action fires on ALARM and on return to OK.
  actions_enabled           = var.actions-enabled
  alarm_actions             = [var.settings.CPUUtilization.action]
  ok_actions                = [var.settings.CPUUtilization.action]
  insufficient_data_actions = []
}
# cwagent metrics
# Looks up the monitored instance; its tags, private IP, instance type and AMI
# are used in the alarm message and in the CWAgent alarm dimensions.
data "aws_instance" "ec2-instance" {
  instance_id = var.ec2-instance-id
}
# put instance name or ip in alarm name
locals {
  # Private IP address of the monitored instance.
  instance-ip = data.aws_instance.ec2-instance.private_ip

  # Value of the instance's Name tag. lookup() with a default replaces
  # tags["Name"], which failed the whole plan for instances without a Name
  # tag; such instances now yield an empty string.
  instance-name = lookup(data.aws_instance.ec2-instance.tags, "Name", "")
}
# Queries the instance's PlatformDetails through the shared awscli utility
# module; the result gates the Linux-only alarms below.
module "ec2_os" {
  source = "../../util/awscli"

  # Credentials for the target account
  access_key    = var.target-account-ak
  secret_key    = var.target-account-sk
  session_token = var.target-account-token

  aws_cli_commands = "ec2 describe-instances --instance-ids ${var.ec2-instance-id} --query Reservations[].Instances[].PlatformDetails"
}
# Linux specific checks
# default cw agent uses mem_used_percent metric
# detect presence of the CloudWatch agent
# Lists at most one CWAgent metric name for the instance; a non-empty result
# is used by the alarms below as the signal that the agent is reporting.
module "detect_cloudwatch_agent" {
  source = "../../util/awscli"

  # Credentials for the target account
  access_key    = var.target-account-ak
  secret_key    = var.target-account-sk
  session_token = var.target-account-token

  aws_cli_commands = "cloudwatch list-metrics --namespace CWAgent --dimensions Name=InstanceId,Value=${var.ec2-instance-id} --query Metrics[].MetricName --max-items 1"
}
# Alarm on the CWAgent mem_used_percent metric. Created only when the reported
# platform is not exactly "Windows" and the agent detection returned a metric.
resource "aws_cloudwatch_metric_alarm" "ec2-mem_used_percent" {
  count = (module.ec2_os.awscliout[0] != "Windows" && length(module.detect_cloudwatch_agent.awscliout) > 0) ? 1 : 0

  # Identity; the description carries the instance details from local.alarm-message.
  alarm_name        = "${var.settings.mem_used_percent.ecccode}-EC2_${var.ec2-instance-id}-mem_used_percent"
  alarm_description = local.alarm-message

  # Metric selection
  namespace   = "CWAgent"
  metric_name = "mem_used_percent"
  dimensions = {
    InstanceId   = var.ec2-instance-id
    ImageId      = data.aws_instance.ec2-instance.ami
    InstanceType = data.aws_instance.ec2-instance.instance_type
  }

  # Evaluation, taken from var.settings.mem_used_percent
  comparison_operator = var.settings.mem_used_percent.comparison_operator
  evaluation_periods  = var.settings.mem_used_percent.evaluation_periods
  period              = var.settings.mem_used_percent.period
  statistic           = var.settings.mem_used_percent.statistic
  threshold           = var.settings.mem_used_percent.threshold

  # Notifications: the same action fires on ALARM and on return to OK.
  actions_enabled           = var.actions-enabled
  alarm_actions             = [var.settings.mem_used_percent.action]
  ok_actions                = [var.settings.mem_used_percent.action]
  insufficient_data_actions = []
}
# Runs get-cwagent-dimensions.sh to fetch the dimensions of the instance's
# CWAgent disk metric, passing the instance id and target-account credentials
# as the JSON query on stdin.
data "external" "cw-dimensions" {
  program = [
    "bash",
    "${path.module}/get-cwagent-dimensions.sh",
  ]

  query = {
    access_key    = var.target-account-ak
    secret_key    = var.target-account-sk
    session_token = var.target-account-token
    iid           = var.ec2-instance-id
  }
}
/* module returns blank
module "cw-dimensions" {
source = "../../util/awscli"
access_key = var.target-account-ak
aws_cli_commands = "cloudwatch list-metrics --namespace CWAgent --metric-name disk_inodes_free --dimensions Name=InstanceId,Value=${var.ec2-instance-id} Name=path,Value=/ --query Metrics[].Dimensions[] | jq '.[] | {(.Name):(.Value)}' | jq -s 'add'"
secret_key = var.target-account-sk
session_token = var.target-account-token
}
*/
# Swap usage alarm on the CWAgent "swap_used_percent" metric.
# Created only when the instance platform is not "Windows" and the CWAgent
# metric listing for the instance returned at least one entry.
resource "aws_cloudwatch_metric_alarm" "ec2-swap_used_percent" {
  count = (
    module.ec2_os.awscliout[0] != "Windows" &&
    length(module.detect_cloudwatch_agent.awscliout) > 0
  ) ? 1 : 0

  # Identity
  alarm_name        = "${var.settings.swap_used_percent.ecccode}-EC2_${var.ec2-instance-id}-swap_used_percent"
  alarm_description = local.alarm-message

  # Metric under observation
  namespace   = "CWAgent"
  metric_name = "swap_used_percent"
  statistic   = var.settings.swap_used_percent.statistic
  period      = var.settings.swap_used_percent.period
  dimensions = {
    ImageId      = data.aws_instance.ec2-instance.ami
    InstanceId   = var.ec2-instance-id
    InstanceType = data.aws_instance.ec2-instance.instance_type
  }

  # Evaluation
  comparison_operator = var.settings.swap_used_percent.comparison_operator
  threshold           = var.settings.swap_used_percent.threshold
  evaluation_periods  = var.settings.swap_used_percent.evaluation_periods

  # Notifications: the same action target is used for ALARM and OK transitions
  actions_enabled           = var.actions-enabled
  alarm_actions             = [var.settings.swap_used_percent.action]
  ok_actions                = [var.settings.swap_used_percent.action]
  insufficient_data_actions = []
}
# Disk usage alarm (warning level) on the CWAgent "disk_used_percent" metric.
# The metric dimensions are discovered by the cw-dimensions external data source.
# NOTE(review): the alarm name suffix is the same as the "crit" alarm's; the two
# names only differ if their ecccode settings differ - confirm.
resource "aws_cloudwatch_metric_alarm" "ec2-disk_used_percent_warn" {
  # The external data source's result is a map and never null, so a "!= null"
  # test is always true. Require at least one discovered dimension instead, so
  # no alarm is created against an empty dimension set.
  count = (
    module.ec2_os.awscliout[0] != "Windows" &&
    length(data.external.cw-dimensions.result) > 0
  ) ? 1 : 0

  alarm_name          = "${var.settings.disk_used_percent_warn.ecccode}-EC2_${var.ec2-instance-id}-disk_used_percent"
  alarm_description   = local.alarm-message
  namespace           = "CWAgent"
  metric_name         = "disk_used_percent"
  dimensions          = data.external.cw-dimensions.result
  statistic           = var.settings.disk_used_percent_warn.statistic
  period              = var.settings.disk_used_percent_warn.period
  comparison_operator = var.settings.disk_used_percent_warn.comparison_operator
  threshold           = var.settings.disk_used_percent_warn.threshold
  evaluation_periods  = var.settings.disk_used_percent_warn.evaluation_periods

  # Notifications: the same action target is used for ALARM and OK transitions
  actions_enabled           = var.actions-enabled
  alarm_actions             = [var.settings.disk_used_percent_warn.action]
  ok_actions                = [var.settings.disk_used_percent_warn.action]
  insufficient_data_actions = []
}
# Disk usage alarm (critical level) on the CWAgent "disk_used_percent" metric.
# The metric dimensions are discovered by the cw-dimensions external data source.
# NOTE(review): the alarm name suffix is the same as the "warn" alarm's; the two
# names only differ if their ecccode settings differ - confirm.
resource "aws_cloudwatch_metric_alarm" "ec2-disk_used_percent_crit" {
  # The external data source's result is a map and never null, so a "!= null"
  # test is always true. Require at least one discovered dimension instead, so
  # no alarm is created against an empty dimension set.
  count = (
    module.ec2_os.awscliout[0] != "Windows" &&
    length(data.external.cw-dimensions.result) > 0
  ) ? 1 : 0

  alarm_name          = "${var.settings.disk_used_percent_crit.ecccode}-EC2_${var.ec2-instance-id}-disk_used_percent"
  alarm_description   = local.alarm-message
  namespace           = "CWAgent"
  metric_name         = "disk_used_percent"
  dimensions          = data.external.cw-dimensions.result
  statistic           = var.settings.disk_used_percent_crit.statistic
  period              = var.settings.disk_used_percent_crit.period
  comparison_operator = var.settings.disk_used_percent_crit.comparison_operator
  threshold           = var.settings.disk_used_percent_crit.threshold
  evaluation_periods  = var.settings.disk_used_percent_crit.evaluation_periods

  # Notifications: the same action target is used for ALARM and OK transitions
  actions_enabled           = var.actions-enabled
  alarm_actions             = [var.settings.disk_used_percent_crit.action]
  ok_actions                = [var.settings.disk_used_percent_crit.action]
  insufficient_data_actions = []
}
# Free inode alarm on the CWAgent "disk_inodes_free" metric.
# The metric dimensions are discovered by the cw-dimensions external data source.
resource "aws_cloudwatch_metric_alarm" "ec2-disk_inodes_free" {
  # The external data source's result is a map and never null, so a "!= null"
  # test is always true. Require at least one discovered dimension instead, so
  # no alarm is created against an empty dimension set.
  count = (
    module.ec2_os.awscliout[0] != "Windows" &&
    length(data.external.cw-dimensions.result) > 0
  ) ? 1 : 0

  alarm_name          = "${var.settings.disk_inodes_free.ecccode}-EC2_${var.ec2-instance-id}-disk_inodes_free"
  alarm_description   = local.alarm-message
  namespace           = "CWAgent"
  metric_name         = "disk_inodes_free"
  dimensions          = data.external.cw-dimensions.result
  statistic           = var.settings.disk_inodes_free.statistic
  period              = var.settings.disk_inodes_free.period
  comparison_operator = var.settings.disk_inodes_free.comparison_operator
  threshold           = var.settings.disk_inodes_free.threshold
  evaluation_periods  = var.settings.disk_inodes_free.evaluation_periods

  # Notifications: the same action target is used for ALARM and OK transitions
  actions_enabled           = var.actions-enabled
  alarm_actions             = [var.settings.disk_inodes_free.action]
  ok_actions                = [var.settings.disk_inodes_free.action]
  insufficient_data_actions = []
}
# Total process count alarm on the CWAgent "processes_total" metric.
# The process metric is not published by the default CloudWatch agent config.
# Created only when the instance platform is not "Windows" and the CWAgent
# metric listing for the instance returned at least one entry.
resource "aws_cloudwatch_metric_alarm" "ec2-processes_total" {
  count = (
    module.ec2_os.awscliout[0] != "Windows" &&
    length(module.detect_cloudwatch_agent.awscliout) > 0
  ) ? 1 : 0

  # Identity
  alarm_name        = "${var.settings.processes_total.ecccode}-EC2_${var.ec2-instance-id}-processes_total"
  alarm_description = local.alarm-message

  # Metric under observation
  namespace   = "CWAgent"
  metric_name = "processes_total"
  statistic   = var.settings.processes_total.statistic
  period      = var.settings.processes_total.period
  dimensions = {
    ImageId      = data.aws_instance.ec2-instance.ami
    InstanceId   = var.ec2-instance-id
    InstanceType = data.aws_instance.ec2-instance.instance_type
  }

  # Evaluation
  comparison_operator = var.settings.processes_total.comparison_operator
  threshold           = var.settings.processes_total.threshold
  evaluation_periods  = var.settings.processes_total.evaluation_periods

  # Notifications: the same action target is used for ALARM and OK transitions
  actions_enabled           = var.actions-enabled
  alarm_actions             = [var.settings.processes_total.action]
  ok_actions                = [var.settings.processes_total.action]
  insufficient_data_actions = []
}
# Combined network error alarm: expression e1 evaluates to 1 when either
# net_err_in (m1) or net_err_out (m2) exceeds its own configured threshold,
# and the alarm fires when e1 > 0. Missing data is treated as not breaching.
resource "aws_cloudwatch_metric_alarm" "ec2-net_err" {
  count = (
    module.ec2_os.awscliout[0] != "Windows" &&
    length(module.detect_cloudwatch_agent.awscliout) > 0
  ) ? 1 : 0

  # Identity
  alarm_name        = "${var.settings.net_err_in.ecccode}-EC2_${var.ec2-instance-id}-net_err"
  alarm_description = local.alarm-message

  # Evaluation of the 0/1 expression result
  comparison_operator = "GreaterThanThreshold"
  threshold           = 0
  evaluation_periods  = var.settings.net_err_in.evaluation_periods
  treat_missing_data  = "notBreaching"

  # Notifications
  # NOTE(review): actions are hard-coded off here, unlike the sibling alarms
  # which follow var.actions-enabled - confirm this is intentional.
  actions_enabled           = false
  alarm_actions             = [var.settings.net_err_in.action]
  ok_actions                = [var.settings.net_err_in.action]
  insufficient_data_actions = []

  # e1: the value the alarm actually evaluates
  metric_query {
    id          = "e1"
    label       = "net_err_exceeds_threshold"
    expression  = "IF(m1 > ${var.settings.net_err_in.threshold} OR m2 > ${var.settings.net_err_out.threshold}, 1, 0)"
    return_data = "true"
  }

  # m1: inbound errors
  # NOTE(review): the interface dimension is fixed to "eth0"; instances whose
  # interface is named differently will not match - confirm.
  metric_query {
    id = "m1"
    metric {
      namespace   = "CWAgent"
      metric_name = "net_err_in"
      stat        = var.settings.net_err_in.statistic
      period      = var.settings.net_err_in.period
      dimensions = {
        ImageId      = data.aws_instance.ec2-instance.ami
        InstanceId   = var.ec2-instance-id
        InstanceType = data.aws_instance.ec2-instance.instance_type
        interface    = "eth0"
      }
    }
  }

  # m2: outbound errors (same dimensions as m1)
  metric_query {
    id = "m2"
    metric {
      namespace   = "CWAgent"
      metric_name = "net_err_out"
      stat        = var.settings.net_err_out.statistic
      period      = var.settings.net_err_out.period
      dimensions = {
        ImageId      = data.aws_instance.ec2-instance.ami
        InstanceId   = var.ec2-instance-id
        InstanceType = data.aws_instance.ec2-instance.instance_type
        interface    = "eth0"
      }
    }
  }
}
# Inbound network traffic alarm on the AWS/EC2 "NetworkIn" metric.
# Opt-in: created only when settings.NetworkIn.monitor is present and true.
resource "aws_cloudwatch_metric_alarm" "ec2-NetworkIn" {
  count = try(var.settings.NetworkIn.monitor, false) ? 1 : 0

  # Identity
  alarm_name        = "${var.settings.NetworkIn.ecccode}-EC2_${var.ec2-instance-id}-NetworkIn"
  alarm_description = local.alarm-message

  # Metric under observation
  namespace   = "AWS/EC2"
  metric_name = "NetworkIn"
  statistic   = var.settings.NetworkIn.statistic
  period      = var.settings.NetworkIn.period
  dimensions = {
    InstanceId = var.ec2-instance-id
  }

  # Evaluation
  comparison_operator = var.settings.NetworkIn.comparison_operator
  threshold           = var.settings.NetworkIn.threshold
  evaluation_periods  = var.settings.NetworkIn.evaluation_periods

  # Notifications: the same action target is used for ALARM and OK transitions
  actions_enabled           = var.actions-enabled
  alarm_actions             = [var.settings.NetworkIn.action]
  ok_actions                = [var.settings.NetworkIn.action]
  insufficient_data_actions = []
}
# Outbound network traffic alarm on the AWS/EC2 "NetworkOut" metric.
# Opt-in: created when settings.NetworkOut.monitor is true.
resource "aws_cloudwatch_metric_alarm" "ec2-NetworkOut" {
  # Previously this was gated on NetworkIn.monitor only (copy-paste slip), so
  # NetworkOut could not be toggled on its own. Prefer NetworkOut's own flag and
  # fall back to NetworkIn's flag so existing configurations keep their behaviour.
  count = coalesce(
    try(var.settings.NetworkOut.monitor, null),
    try(var.settings.NetworkIn.monitor, null),
    false
  ) ? 1 : 0

  # Identity
  alarm_name        = "${var.settings.NetworkOut.ecccode}-EC2_${var.ec2-instance-id}-NetworkOut"
  alarm_description = local.alarm-message

  # Metric under observation
  namespace   = "AWS/EC2"
  metric_name = "NetworkOut"
  statistic   = var.settings.NetworkOut.statistic
  period      = var.settings.NetworkOut.period
  dimensions = {
    InstanceId = var.ec2-instance-id
  }

  # Evaluation
  comparison_operator = var.settings.NetworkOut.comparison_operator
  threshold           = var.settings.NetworkOut.threshold
  evaluation_periods  = var.settings.NetworkOut.evaluation_periods

  # Notifications: the same action target is used for ALARM and OK transitions
  actions_enabled           = var.actions-enabled
  alarm_actions             = [var.settings.NetworkOut.action]
  ok_actions                = [var.settings.NetworkOut.action]
  insufficient_data_actions = []
}
# Windows specific checks

# Committed memory alarm on the CWAgent "Memory % Committed Bytes In Use" metric.
# Created only when the instance platform is "Windows" and the CWAgent metric
# listing for the instance returned at least one entry.
resource "aws_cloudwatch_metric_alarm" "ec2-MemoryCommittedPct" {
  count = (
    module.ec2_os.awscliout[0] == "Windows" &&
    length(module.detect_cloudwatch_agent.awscliout) > 0
  ) ? 1 : 0

  # Identity
  alarm_name        = "${var.settings.MemoryCommittedPct.ecccode}-EC2_${var.ec2-instance-id}-MemoryCommittedPct"
  alarm_description = local.alarm-message

  # Metric under observation
  namespace   = "CWAgent"
  metric_name = "Memory % Committed Bytes In Use"
  statistic   = var.settings.MemoryCommittedPct.statistic
  period      = var.settings.MemoryCommittedPct.period
  dimensions = {
    ImageId      = data.aws_instance.ec2-instance.ami
    InstanceId   = var.ec2-instance-id
    InstanceType = data.aws_instance.ec2-instance.instance_type
    objectname   = "Memory"
  }

  # Evaluation
  comparison_operator = var.settings.MemoryCommittedPct.comparison_operator
  threshold           = var.settings.MemoryCommittedPct.threshold
  evaluation_periods  = var.settings.MemoryCommittedPct.evaluation_periods

  # Notifications: the same action target is used for ALARM and OK transitions
  actions_enabled           = var.actions-enabled
  alarm_actions             = [var.settings.MemoryCommittedPct.action]
  ok_actions                = [var.settings.MemoryCommittedPct.action]
  insufficient_data_actions = []
}
# Free disk space alarm on the CWAgent "LogicalDisk % Free Space" metric for
# the C: volume. Created only when the instance platform is "Windows" and the
# CWAgent metric listing for the instance returned at least one entry.
resource "aws_cloudwatch_metric_alarm" "ec2-LogicalDiskFreePct" {
  count = (
    module.ec2_os.awscliout[0] == "Windows" &&
    length(module.detect_cloudwatch_agent.awscliout) > 0
  ) ? 1 : 0

  # Identity
  alarm_name        = "${var.settings.LogicalDiskFreePct.ecccode}-EC2_${var.ec2-instance-id}-LogicalDiskFreePct"
  alarm_description = local.alarm-message

  # Metric under observation (only the C: logical disk is covered)
  namespace   = "CWAgent"
  metric_name = "LogicalDisk % Free Space"
  statistic   = var.settings.LogicalDiskFreePct.statistic
  period      = var.settings.LogicalDiskFreePct.period
  dimensions = {
    ImageId      = data.aws_instance.ec2-instance.ami
    InstanceId   = var.ec2-instance-id
    InstanceType = data.aws_instance.ec2-instance.instance_type
    instance     = "C:"
    objectname   = "LogicalDisk"
  }

  # Evaluation
  comparison_operator = var.settings.LogicalDiskFreePct.comparison_operator
  threshold           = var.settings.LogicalDiskFreePct.threshold
  evaluation_periods  = var.settings.LogicalDiskFreePct.evaluation_periods

  # Notifications: the same action target is used for ALARM and OK transitions
  actions_enabled           = var.actions-enabled
  alarm_actions             = [var.settings.LogicalDiskFreePct.action]
  ok_actions                = [var.settings.LogicalDiskFreePct.action]
  insufficient_data_actions = []
}
@@ -0,0 +1,9 @@
# Version constraints for this module.
terraform {
  required_providers {
    # Any hashicorp/aws release from 4.36.1 upwards (no upper bound)
    aws = {
      version = ">= 4.36.1"
      source  = "hashicorp/aws"
    }
  }

  # "~> 1.3.0" admits only Terraform 1.3.x patch releases
  required_version = "~> 1.3.0"
}
@@ -0,0 +1,8 @@
# Inputs of the EC2 monitoring module (all untyped, all required).

# Alarm name prefix supplied by the caller
variable "cw-alarm-prefix" {
}

# Passed to actions_enabled of the alarms
variable "actions-enabled" {
}

# Instance whose metrics the alarms watch
variable "ec2-instance-id" {
}

# Per-alarm settings; the alarms read ecccode, comparison_operator,
# evaluation_periods, period, statistic, threshold and action from each entry
variable "settings" {
}

# variable asrolearn {}

# Target-account credentials handed to the awscli helper module and to the
# external dimension-discovery script
variable "target-account-ak" {
}

variable "target-account-sk" {
}

variable "target-account-token" {
}
@@ -0,0 +1,27 @@
# Monitoring module
This module deploys the default cloudwatch metric monitoring
## Notes
The Terraform lifecycle configuration ignores tags to speed up subsequent Terraform updates. CloudWatch alarm tags cannot be viewed in the AWS console anyway.
Unlike the other monitoring modules, which discover resource details automatically, this module needs the EKS pod names to be supplied explicitly,
because the AWS CLI does not provide pod information.
## Example
```terraform
data "aws_eks_clusters" "eks-clusters" {}
module "eks-monitoring" {
cw-alarm-prefix = local.cw-alarm-prefix
for_each = data.aws_eks_clusters.eks-clusters.names
source = "../../modules/ManagementGovernance/Monitoring.EKS"
default-tags = local.default-tags
cluster-name = each.value
eks-namespace = "default"
pod-names = ["depl-nginx", "depl-alpine"]
threshold-pod_cpu_utilization = 85
threshold-pod_memory_utilization = 85
threshold-pod_number_of_container_restarts = 5
actions-enabled = var.actions-enabled
sns-targets = local.sns-targets
}
```
@@ -0,0 +1,69 @@
// The following checks require Container Insights

# Per-pod alarm driven by settings.alarm1 (metric name, thresholds and action).
# One alarm is created for every pod name in var.pod-names.
resource "aws_cloudwatch_metric_alarm" "eks-pod_cpu_utilization" {
  for_each = toset(var.pod-names)

  # each.value is the pod name (a plain string) and cannot be indexed with
  # ["ecccode"]; the ecccode comes from the alarm's settings entry instead.
  alarm_name        = "${var.settings.alarm1.ecccode}:${var.cw-alarm-prefix}:EKS:${var.cluster-name}:${each.value}:${var.settings.alarm1.metric}"
  alarm_description = "EKS:${var.settings.alarm1.metric}"

  # Metric under observation
  namespace   = "ContainerInsights"
  metric_name = var.settings.alarm1.metric
  statistic   = var.settings.alarm1.statistic
  period      = var.settings.alarm1.period
  dimensions = {
    "PodName"     = each.value
    "ClusterName" = var.cluster-name
    "Namespace"   = var.eks-namespace
  }

  # Evaluation
  comparison_operator = var.settings.alarm1.comparison_operator
  threshold           = var.settings.alarm1.threshold
  evaluation_periods  = var.settings.alarm1.evaluation_periods

  # Notifications: the same action target is used for ALARM and OK transitions
  actions_enabled           = var.actions-enabled
  alarm_actions             = [var.settings.alarm1.action]
  ok_actions                = [var.settings.alarm1.action]
  insufficient_data_actions = []
}
# Per-pod alarm driven by settings.alarm2 (metric name, threshold and action).
# One alarm is created for every pod name in var.pod-names.
resource "aws_cloudwatch_metric_alarm" "eks-pod_memory_utilization" {
  for_each = toset(var.pod-names)

  # each.value is the pod name (a plain string) and cannot be indexed with
  # ["ecccode"]; the ecccode comes from the alarm's settings entry instead.
  alarm_name        = "${var.settings.alarm2.ecccode}:${var.cw-alarm-prefix}:EKS:${var.cluster-name}:${each.value}:${var.settings.alarm2.metric}"
  alarm_description = "EKS:${var.settings.alarm2.metric}"

  # Metric under observation
  namespace   = "ContainerInsights"
  metric_name = var.settings.alarm2.metric
  statistic   = var.settings.alarm2.statistic
  period      = var.settings.alarm2.period
  dimensions = {
    "PodName"     = each.value
    "ClusterName" = var.cluster-name
    "Namespace"   = var.eks-namespace
  }

  # Evaluation
  # NOTE(review): operator and evaluation periods are fixed here, whereas the
  # alarm1 resource reads them from settings - confirm this is intentional.
  comparison_operator = "GreaterThanThreshold"
  threshold           = var.settings.alarm2.threshold
  evaluation_periods  = "3"

  # Notifications: the same action target is used for ALARM and OK transitions
  actions_enabled           = var.actions-enabled
  alarm_actions             = [var.settings.alarm2.action]
  ok_actions                = [var.settings.alarm2.action]
  insufficient_data_actions = []
}
# Per-pod alarm driven by settings.alarm3 (metric name, threshold and action).
# One alarm is created for every pod name in var.pod-names.
resource "aws_cloudwatch_metric_alarm" "eks-pod_number_of_container_restarts" {
  for_each = toset(var.pod-names)

  # each.value is the pod name (a plain string) and cannot be indexed with
  # ["ecccode"]; the ecccode comes from the alarm's settings entry instead.
  alarm_name        = "${var.settings.alarm3.ecccode}:${var.cw-alarm-prefix}:EKS:${var.cluster-name}:${each.value}:${var.settings.alarm3.metric}"
  alarm_description = "EKS:${var.settings.alarm3.metric}"

  # Metric under observation
  namespace   = "ContainerInsights"
  metric_name = var.settings.alarm3.metric
  statistic   = var.settings.alarm3.statistic
  period      = var.settings.alarm3.period
  dimensions = {
    "PodName"     = each.value
    "ClusterName" = var.cluster-name
    "Namespace"   = var.eks-namespace
  }

  # Evaluation
  # NOTE(review): operator and evaluation periods are fixed here, whereas the
  # alarm1 resource reads them from settings - confirm this is intentional.
  comparison_operator = "GreaterThanThreshold"
  threshold           = var.settings.alarm3.threshold
  evaluation_periods  = "3"

  # Notifications: the same action target is used for ALARM and OK transitions
  actions_enabled           = var.actions-enabled
  alarm_actions             = [var.settings.alarm3.action]
  ok_actions                = [var.settings.alarm3.action]
  insufficient_data_actions = []
}
@@ -0,0 +1,9 @@
# Version constraints for this module.
terraform {
  required_providers {
    # Any hashicorp/aws release from 4.36.1 upwards (no upper bound)
    aws = {
      version = ">= 4.36.1"
      source  = "hashicorp/aws"
    }
  }

  # "~> 1.3.0" admits only Terraform 1.3.x patch releases
  required_version = "~> 1.3.0"
}
@@ -0,0 +1,8 @@
# Inputs of the EKS monitoring module.

# Prefix embedded in every alarm name
variable "cw-alarm-prefix" {}

# Passed to actions_enabled of the alarms
variable "actions-enabled" {}

# Value of the ClusterName dimension
variable "cluster-name" {}

# Value of the Namespace dimension
variable "eks-namespace" {}

# Pod names to create alarms for; each becomes a for_each key and the value of
# the PodName dimension, so the elements must be strings. The bare "list"
# keyword is legacy shorthand; state the element type explicitly.
variable "pod-names" {
  type = list(string)
}

# Alarm settings; the resources read the alarm1, alarm2 and alarm3 entries
variable "settings" {}
@@ -0,0 +1,25 @@
# Monitoring module
This module deploys the default cloudwatch metric monitoring
## Notes
The Terraform lifecycle configuration ignores tags to speed up subsequent Terraform updates. CloudWatch alarm tags cannot be viewed in the AWS console anyway.
## Example
```terraform
module "emr-clusters" {
source = "../../modules/util/resource-list"
resource-type = "emr"
}
module "emr-monitoring" {
cw-alarm-prefix = local.cw-alarm-prefix
for_each = module.emr-clusters.result-set
source = "../../modules/ManagementGovernance/Monitoring.EMR"
default-tags = local.default-tags
job-flow-id = split("/", each.value)[1]
threshold-AppsPending = 2
threshold-CapacityRemainingGB = 100
actions-enabled = var.actions-enabled
sns-targets = var.sns-targets
}
```
@@ -0,0 +1,19 @@
# One AWS/ElasticMapReduce alarm per entry of var.settings, all scoped to the
# cluster identified by var.job-flow-id. Every entry supplies its own metric
# name, statistic, period, comparison, threshold and action target.
resource "aws_cloudwatch_metric_alarm" "emr-alarms" {
  for_each = var.settings

  # Identity
  alarm_name        = "${each.value.ecccode}-EMR_${var.job-flow-id}-${each.value.metric}"
  alarm_description = "EMR:${each.value.metric}"

  # Metric under observation
  namespace   = "AWS/ElasticMapReduce"
  metric_name = each.value.metric
  statistic   = each.value.statistic
  period      = each.value.period
  dimensions = {
    JobFlowId = var.job-flow-id
  }

  # Evaluation
  comparison_operator = each.value.comparison_operator
  threshold           = each.value.threshold
  evaluation_periods  = each.value.evaluation_periods

  # Notifications: the same action target is used for ALARM and OK transitions
  actions_enabled           = var.actions-enabled
  alarm_actions             = [each.value.action]
  ok_actions                = [each.value.action]
  insufficient_data_actions = []
}
@@ -0,0 +1,9 @@
# Version constraints for this module.
terraform {
  required_providers {
    # Any hashicorp/aws release from 4.36.1 upwards (no upper bound)
    aws = {
      version = ">= 4.36.1"
      source  = "hashicorp/aws"
    }
  }

  # "~> 1.3.0" admits only Terraform 1.3.x patch releases
  required_version = "~> 1.3.0"
}
@@ -0,0 +1,4 @@
# Inputs of the EMR monitoring module (all untyped, all required).

# Alarm name prefix supplied by the caller
variable "cw-alarm-prefix" {
}

# Passed to actions_enabled of the alarms
variable "actions-enabled" {
}

# Value of the JobFlowId dimension; also embedded in the alarm names
variable "job-flow-id" {
}

# Collection of alarm definitions; one alarm is created per entry
variable "settings" {
}
@@ -0,0 +1,5 @@
# Monitoring module
This module deploys the default cloudwatch metric monitoring
## Notes
The Terraform lifecycle configuration ignores tags to speed up subsequent Terraform updates. CloudWatch alarm tags cannot be viewed in the AWS console anyway.
@@ -0,0 +1,46 @@
# Event rule matching AWS Health events for a fixed list of services; the
# matched events are forwarded by the event target defined for this rule.
resource "aws_cloudwatch_event_rule" "EventRule" {
  name        = "${var.cw-alarm-prefix}-health-events"
  description = "A CloudWatch Event Rule that triggers on changes in the status of AWS Personal Health Dashboard (AWS Health) and forwards the events to an SNS topic."

  # "state" takes a state name such as "ENABLED" or "DISABLED", while the other
  # monitoring modules treat actions-enabled as a boolean. Accept both: a
  # boolean (or "true"/"false") is mapped to ENABLED/DISABLED, anything else is
  # passed through unchanged so callers already supplying a state name still work.
  state = try(
    tobool(var.actions-enabled) ? "ENABLED" : "DISABLED",
    tostring(var.actions-enabled)
  )

  # Built with jsonencode so the pattern is always well-formed JSON
  event_pattern = jsonencode({
    "detail" = {
      "service" = ["DIRECTCONNECT", "VPN", "LAMBDA", "EC2", "RDS"]
    }
    "detail-type" = [
      "AWS Health Event"
    ]
    "source" = [
      "aws.health"
    ]
  })

  # Changes to the LastModified tag alone do not trigger an update
  lifecycle {
    ignore_changes = [tags["LastModified"]]
  }
}
# Delivers events matched by the health event rule to the target ARN configured
# in settings.healthEvents.action, after rewriting each event into a short
# human-readable message.
resource "aws_cloudwatch_event_target" "TargetForEventRule" {
  rule = aws_cloudwatch_event_rule.EventRule.name
  arn  = var.settings.healthEvents.action
  # target_id = "health-event-notification-sns"

  input_transformer {
    # Event fields extracted for use as <placeholders> in the template
    input_paths = {
      "account"   = "$.account"
      "service"   = "$.detail.service"
      "resources" = "$.resources"
      "startTime" = "$.detail.startTime"
      "endTime"   = "$.detail.endTime"
      "message"   = "$.detail.eventDescription[0].latestDescription"
    }

    # Indented heredoc: the common leading indentation is stripped by "<<-"
    input_template = <<-EOF
      "A maintenance has been scheduled for <service> on AWS account <account>."
      "Resources: <resources>"
      "Start time: <startTime>"
      "End time: <endTime>"
      "Detail: <message>"
    EOF
  }
}
@@ -0,0 +1,9 @@
# Version constraints for this module.
terraform {
  required_providers {
    # Any hashicorp/aws release from 4.36.1 upwards (no upper bound)
    # NOTE(review): the event rule in this module sets the "state" argument;
    # confirm the minimum provider version that supports it.
    aws = {
      version = ">= 4.36.1"
      source  = "hashicorp/aws"
    }
  }

  # "~> 1.3.0" admits only Terraform 1.3.x patch releases
  required_version = "~> 1.3.0"
}
@@ -0,0 +1,3 @@
# Inputs of the health event module (all untyped, all required).

# Prefix of the event rule name
variable "cw-alarm-prefix" {
}

# Assigned to the event rule's state argument
variable "actions-enabled" {
}

# Settings; healthEvents.action supplies the event target ARN
variable "settings" {
}
@@ -0,0 +1,24 @@
# Monitoring module
This module deploys the default cloudwatch metric monitoring
## Notes
The Terraform lifecycle configuration ignores tags to speed up subsequent Terraform updates. CloudWatch alarm tags cannot be viewed in the AWS console anyway.
## Example
```terraform
module "kafka-clusters" {
source = "../../modules/util/resource-list"
resource-type = "kafka"
}
module "kafka-monitoring" {
cw-alarm-prefix = local.cw-alarm-prefix
for_each = module.kafka-clusters.result-set
source = "../../modules/ManagementGovernance/Monitoring.Kafka"
default-tags = local.default-tags
cluster-name = each.value
threshold-ZooKeeperRequestLatencyMsMean = 30
actions-enabled = var.actions-enabled
sns-targets = var.sns-targets
}
```
@@ -0,0 +1,116 @@
# Cluster-level alarm on the AWS/Kafka "ZooKeeperRequestLatencyMsMean" metric.
resource "aws_cloudwatch_metric_alarm" "Kafka-ZooKeeperRequestLatencyMsMean" {
  # Identity
  alarm_name        = "${var.settings.ZooKeeperRequestLatencyMsMean.ecccode}-Kafka_${var.cluster-name}-ZooKeeperRequestLatencyMsMean"
  alarm_description = "Kafka:ZooKeeperRequestLatencyMsMean"

  # Metric under observation
  namespace   = "AWS/Kafka"
  metric_name = "ZooKeeperRequestLatencyMsMean"
  statistic   = var.settings.ZooKeeperRequestLatencyMsMean.statistic
  period      = var.settings.ZooKeeperRequestLatencyMsMean.period
  dimensions = {
    "Cluster Name" = var.cluster-name
  }

  # Evaluation
  comparison_operator = var.settings.ZooKeeperRequestLatencyMsMean.comparison_operator
  threshold           = var.settings.ZooKeeperRequestLatencyMsMean.threshold
  evaluation_periods  = var.settings.ZooKeeperRequestLatencyMsMean.evaluation_periods

  # Notifications: the same action target is used for ALARM and OK transitions
  actions_enabled           = var.actions-enabled
  alarm_actions             = [var.settings.ZooKeeperRequestLatencyMsMean.action]
  ok_actions                = [var.settings.ZooKeeperRequestLatencyMsMean.action]
  insufficient_data_actions = []
}
# Look the MSK cluster up by name to obtain its ARN.
data "aws_msk_cluster" "msk-cluster" {
  cluster_name = var.cluster-name
}

# Broker nodes of that cluster; their broker ids drive the per-broker alarms.
data "aws_msk_broker_nodes" "msk-broker" {
  cluster_arn = data.aws_msk_cluster.msk-cluster.arn
}
# Per-broker CPU alarm: evaluates the sum of the AWS/Kafka CpuUser (m1) and
# CpuSystem (m2) metrics (expression e1) against the CpuUserSystem settings.
# One alarm is created per broker id of the cluster.
resource "aws_cloudwatch_metric_alarm" "Kafka-CpuUserSystem" {
  for_each = toset([for i in data.aws_msk_broker_nodes.msk-broker.node_info_list[*].broker_id : tostring(i)])

  # Identity
  alarm_name = "${var.settings.CpuUserSystem.ecccode}-Kafka_${var.cluster-name}-${each.value}-CpuUsage"
  # Was "Kafka:ZooKeeperRequestLatencyMsMean" (copied from the ZooKeeper alarm);
  # describe the metric this alarm actually evaluates.
  alarm_description = "Kafka:CpuUserSystem"

  # Evaluation of expression e1
  comparison_operator = var.settings.CpuUserSystem.comparison_operator
  threshold           = var.settings.CpuUserSystem.threshold
  evaluation_periods  = var.settings.CpuUserSystem.evaluation_periods

  # Notifications: the same action target is used for ALARM and OK transitions
  actions_enabled           = var.actions-enabled
  alarm_actions             = [var.settings.CpuUserSystem.action]
  ok_actions                = [var.settings.CpuUserSystem.action]
  insufficient_data_actions = []

  # m1: user-space CPU of this broker
  metric_query {
    id = "m1"
    metric {
      metric_name = "CpuUser"
      namespace   = "AWS/Kafka"
      period      = var.settings.CpuUserSystem.period
      stat        = var.settings.CpuUserSystem.statistic
      dimensions = {
        "Cluster Name" = var.cluster-name
        "Broker ID"    = each.value
      }
    }
  }

  # m2: kernel-space CPU of this broker
  metric_query {
    id = "m2"
    metric {
      metric_name = "CpuSystem"
      namespace   = "AWS/Kafka"
      period      = var.settings.CpuUserSystem.period
      stat        = var.settings.CpuUserSystem.statistic
      dimensions = {
        "Cluster Name" = var.cluster-name
        "Broker ID"    = each.value
      }
    }
  }

  # e1: the value the alarm evaluates
  metric_query {
    id          = "e1"
    expression  = "m1 + m2"
    label       = "CpuUserSystem"
    return_data = "true"
  }
}
# Per-broker alarm on the AWS/Kafka "KafkaDataLogsDiskUsed" metric.
# One alarm is created per broker id of the cluster.
resource "aws_cloudwatch_metric_alarm" "Kafka-KafkaDataLogsDiskUsed" {
  # Broker ids converted to strings so they can serve as for_each keys
  for_each = toset([
    for node in data.aws_msk_broker_nodes.msk-broker.node_info_list : tostring(node.broker_id)
  ])

  # Identity
  alarm_name        = "${var.settings.KafkaDataLogsDiskUsed.ecccode}-Kafka_${var.cluster-name}-${each.value}-KafkaDataLogsDiskUsed"
  alarm_description = "Kafka:KafkaDataLogsDiskUsed"

  # Metric under observation
  namespace   = "AWS/Kafka"
  metric_name = "KafkaDataLogsDiskUsed"
  statistic   = var.settings.KafkaDataLogsDiskUsed.statistic
  period      = var.settings.KafkaDataLogsDiskUsed.period
  dimensions = {
    "Broker ID"    = each.value
    "Cluster Name" = var.cluster-name
  }

  # Evaluation
  comparison_operator = var.settings.KafkaDataLogsDiskUsed.comparison_operator
  threshold           = var.settings.KafkaDataLogsDiskUsed.threshold
  evaluation_periods  = var.settings.KafkaDataLogsDiskUsed.evaluation_periods

  # Notifications: the same action target is used for ALARM and OK transitions
  actions_enabled           = var.actions-enabled
  alarm_actions             = [var.settings.KafkaDataLogsDiskUsed.action]
  ok_actions                = [var.settings.KafkaDataLogsDiskUsed.action]
  insufficient_data_actions = []
}
# Per-broker alarm on the AWS/Kafka "HeapMemoryAfterGC" metric.
# One alarm is created per broker id of the cluster.
resource "aws_cloudwatch_metric_alarm" "Kafka-HeapMemoryAfterGC" {
  # Broker ids converted to strings so they can serve as for_each keys
  for_each = toset([
    for node in data.aws_msk_broker_nodes.msk-broker.node_info_list : tostring(node.broker_id)
  ])

  # Identity
  alarm_name        = "${var.settings.HeapMemoryAfterGC.ecccode}-Kafka_${var.cluster-name}-${each.value}-HeapMemoryAfterGC"
  alarm_description = "Kafka:HeapMemoryAfterGC"

  # Metric under observation
  namespace   = "AWS/Kafka"
  metric_name = "HeapMemoryAfterGC"
  statistic   = var.settings.HeapMemoryAfterGC.statistic
  period      = var.settings.HeapMemoryAfterGC.period
  dimensions = {
    "Broker ID"    = each.value
    "Cluster Name" = var.cluster-name
  }

  # Evaluation
  comparison_operator = var.settings.HeapMemoryAfterGC.comparison_operator
  threshold           = var.settings.HeapMemoryAfterGC.threshold
  evaluation_periods  = var.settings.HeapMemoryAfterGC.evaluation_periods

  # Notifications: the same action target is used for ALARM and OK transitions
  actions_enabled           = var.actions-enabled
  alarm_actions             = [var.settings.HeapMemoryAfterGC.action]
  ok_actions                = [var.settings.HeapMemoryAfterGC.action]
  insufficient_data_actions = []
}
@@ -0,0 +1,9 @@
# Version constraints for this module.
terraform {
  required_providers {
    # Any hashicorp/aws release from 4.36.1 upwards (no upper bound)
    aws = {
      version = ">= 4.36.1"
      source  = "hashicorp/aws"
    }
  }

  # "~> 1.3.0" admits only Terraform 1.3.x patch releases
  required_version = "~> 1.3.0"
}
@@ -0,0 +1,4 @@
# Inputs of the Kafka (MSK) monitoring module (all untyped, all required).

# Alarm name prefix supplied by the caller
variable "cw-alarm-prefix" {
}

# Passed to actions_enabled of the alarms
variable "actions-enabled" {
}

# MSK cluster name; used for the cluster lookup and the "Cluster Name" dimension
variable "cluster-name" {
}

# Per-alarm settings keyed by alarm name (e.g. CpuUserSystem, HeapMemoryAfterGC)
variable "settings" {
}
@@ -0,0 +1,26 @@
# Monitoring module
This module deploys the default cloudwatch metric monitoring
## Notes
The Terraform lifecycle configuration ignores tags to speed up subsequent Terraform updates. CloudWatch alarm tags cannot be viewed in the AWS console anyway.
## Example
```terraform
module "ngw" {
source = "../../modules/util/resource-list"
resource-type = "ngw"
}
module "ngw-monitoring" {
cw-alarm-prefix = local.cw-alarm-prefix
for_each = module.ngw.result-set
source = "../../modules/ManagementGovernance/Monitoring.NGW"
default-tags = local.default-tags
res-id = split("/", each.value)[1]
threshold-ErrorPortAllocation = 2
threshold-ConnectionEstablishedCount = 1000
threshold-PacketsDropCount = 10
actions-enabled = var.actions-enabled
sns-targets = var.sns-targets
}
```
@@ -0,0 +1,19 @@
# One AWS/NATGateway alarm per entry of var.settings, all scoped to the NAT
# gateway identified by var.res-id. Every entry supplies its own metric name,
# statistic, period, comparison, threshold and action target.
resource "aws_cloudwatch_metric_alarm" "ngw-alarms" {
  for_each = var.settings

  # Identity
  alarm_name        = "${each.value.ecccode}-NGW_${var.res-id}-${each.value.metric}"
  alarm_description = "NGW:${each.value.metric}"

  # Metric under observation
  namespace   = "AWS/NATGateway"
  metric_name = each.value.metric
  statistic   = each.value.statistic
  period      = each.value.period
  dimensions = {
    NatGatewayId = var.res-id
  }

  # Evaluation
  comparison_operator = each.value.comparison_operator
  threshold           = each.value.threshold
  evaluation_periods  = each.value.evaluation_periods

  # Notifications: the same action target is used for ALARM and OK transitions
  actions_enabled           = var.actions-enabled
  alarm_actions             = [each.value.action]
  ok_actions                = [each.value.action]
  insufficient_data_actions = []
}
@@ -0,0 +1,9 @@
# Version constraints for this module.
terraform {
  required_providers {
    # Any hashicorp/aws release from 4.36.1 upwards (no upper bound)
    aws = {
      version = ">= 4.36.1"
      source  = "hashicorp/aws"
    }
  }

  # "~> 1.3.0" admits only Terraform 1.3.x patch releases
  required_version = "~> 1.3.0"
}
@@ -0,0 +1,4 @@
# Inputs of the NAT gateway monitoring module (all untyped, all required).

# Alarm name prefix supplied by the caller
variable "cw-alarm-prefix" {
}

# Passed to actions_enabled of the alarms
variable "actions-enabled" {
}

# Value of the NatGatewayId dimension; also embedded in the alarm names
variable "res-id" {
}

# Collection of alarm definitions; one alarm is created per entry
variable "settings" {
}
@@ -0,0 +1,24 @@
# Monitoring module
This module deploys the default cloudwatch metric monitoring
## Notes
The Terraform lifecycle configuration ignores tags to speed up subsequent Terraform updates. CloudWatch alarm tags cannot be viewed in the AWS console anyway.
## Example
```terraform
module "nlb-arns" {
source = "../../modules/util/resource-list"
resource-type = "nlb"
}
module "nlb-monitoring" {
cw-alarm-prefix = local.cw-alarm-prefix
for_each = module.nlb-arns.result-set
source = "../../modules/ManagementGovernance/Monitoring.NLB"
default-tags = local.default-tags
load-balancer = each.value
threshold-HealthHostCountMin = 1
actions-enabled = var.actions-enabled
sns-targets = var.sns-targets
}
```
@@ -0,0 +1,105 @@
/*
data "external" "nlb-targetgroups" {
program = ["bash", "${path.module}/list-nlb-targetgroups.sh"]
query = {
parameter = var.load-balancer
}
}
*/
locals {
  # CloudWatch LoadBalancer dimension value, "net/<3rd>/<4th>", built from the
  # third and fourth "/"-separated segments of var.load-balancer.
  nlb-name = format(
    "net/%s/%s",
    split("/", var.load-balancer)[2],
    split("/", var.load-balancer)[3]
  )
}
# Load-balancer-level alarm on the AWS/NetworkELB "TCP_Target_Reset_Count" metric.
resource "aws_cloudwatch_metric_alarm" "nlb-TCP_Target_Reset_Count" {
  # Identity
  alarm_name        = "${var.settings.TCP_Target_Reset_Count.ecccode}-NLB_${local.nlb-name}-TCP_Target_Reset_Count"
  alarm_description = "NLB:TCP_Target_Reset_Count"

  # Metric under observation
  namespace   = "AWS/NetworkELB"
  metric_name = "TCP_Target_Reset_Count"
  statistic   = var.settings.TCP_Target_Reset_Count.statistic
  period      = var.settings.TCP_Target_Reset_Count.period
  dimensions = {
    LoadBalancer = local.nlb-name
  }

  # Evaluation
  comparison_operator = var.settings.TCP_Target_Reset_Count.comparison_operator
  threshold           = var.settings.TCP_Target_Reset_Count.threshold
  evaluation_periods  = var.settings.TCP_Target_Reset_Count.evaluation_periods

  # Notifications: the same action target is used for ALARM and OK transitions
  actions_enabled           = var.actions-enabled
  alarm_actions             = [var.settings.TCP_Target_Reset_Count.action]
  ok_actions                = [var.settings.TCP_Target_Reset_Count.action]
  insufficient_data_actions = []
}
/*
module "nlb-targetgroups" {
source = "../../util/resource-list"
resource-type = "nlb-targetgroups"
query-input = var.load-balancer
asrolearn = var.asrolearn
}
*/
// causes Rate exceeded error, maybe because of adaptive AWS_RETRY_MODE?
/*
module "nlb_tgs" {
assume_role_arn = var.asrolearn
role_session_name = "terraform-resource-list"
source = "../../util/terraform-aws-cli"
aws_cli_commands = ["elbv2", "describe-target-groups", "--load-balancer-arn", var.load-balancer]
aws_cli_query = "TargetGroups[*].TargetGroupArn"
}
*/
# Lists the ARNs of the target groups attached to the load balancer, using the
# awscli helper module with the target-account credentials.
module "nlb_tgs" {
  source = "../../util/awscli"

  aws_cli_commands = "elbv2 describe-target-groups --load-balancer-arn ${var.load-balancer} --query TargetGroups[*].TargetGroupArn"

  # Credentials used by the awscli helper module for the target account
  access_key    = var.target-account-ak
  secret_key    = var.target-account-sk
  session_token = var.target-account-token
}
# Per-target-group alarm on the AWS/NetworkELB "HealthyHostCount" metric,
# configured from settings.HealthHostCountMin. One alarm is created per target
# group ARN returned by the nlb_tgs module.
resource "aws_cloudwatch_metric_alarm" "nlb-HealthyHostCount" {
  for_each = toset(module.nlb_tgs.awscliout)

  # Identity; the sixth ":"-separated field of the ARN names the target group
  alarm_name        = "${var.settings.HealthHostCountMin.ecccode}-NLBTG_${split(":", each.value)[5]}-HealthyHostCount"
  alarm_description = "NLBTG:HealthyHostCount"

  # Metric under observation
  namespace   = "AWS/NetworkELB"
  metric_name = "HealthyHostCount"
  statistic   = var.settings.HealthHostCountMin.statistic
  period      = var.settings.HealthHostCountMin.period
  dimensions = {
    # local.nlb-name is the same "net/<3rd>/<4th>" value derived from var.load-balancer
    LoadBalancer = local.nlb-name
    TargetGroup  = split(":", each.value)[5]
  }

  # Evaluation
  comparison_operator = var.settings.HealthHostCountMin.comparison_operator
  threshold           = var.settings.HealthHostCountMin.threshold
  evaluation_periods  = var.settings.HealthHostCountMin.evaluation_periods

  # Notifications: the same action target is used for ALARM and OK transitions
  actions_enabled           = var.actions-enabled
  alarm_actions             = [var.settings.HealthHostCountMin.action]
  ok_actions                = [var.settings.HealthHostCountMin.action]
  insufficient_data_actions = []
}
# Per-target-group alarm on the AWS/NetworkELB "UnHealthyHostCount" metric.
# One alarm is created per target group ARN returned by the nlb_tgs module.
resource "aws_cloudwatch_metric_alarm" "nlb-UnHealthyHostCount" {
  for_each = toset(module.nlb_tgs.awscliout)

  # Identity; the sixth ":"-separated field of the ARN names the target group
  alarm_name        = "${var.settings.UnHealthyHostCount.ecccode}-NLBTG_${split(":", each.value)[5]}-UnHealthyHostCount"
  alarm_description = "NLBTG:UnHealthyHostCount"

  # Metric under observation
  namespace   = "AWS/NetworkELB"
  metric_name = "UnHealthyHostCount"
  statistic   = var.settings.UnHealthyHostCount.statistic
  period      = var.settings.UnHealthyHostCount.period
  dimensions = {
    # local.nlb-name is the same "net/<3rd>/<4th>" value derived from var.load-balancer
    LoadBalancer = local.nlb-name
    TargetGroup  = split(":", each.value)[5]
  }

  # Evaluation
  comparison_operator = var.settings.UnHealthyHostCount.comparison_operator
  threshold           = var.settings.UnHealthyHostCount.threshold
  evaluation_periods  = var.settings.UnHealthyHostCount.evaluation_periods

  # Notifications: the same action target is used for ALARM and OK transitions
  actions_enabled           = var.actions-enabled
  alarm_actions             = [var.settings.UnHealthyHostCount.action]
  ok_actions                = [var.settings.UnHealthyHostCount.action]
  insufficient_data_actions = []
}
@@ -0,0 +1,4 @@
# Number of target group ARNs returned by the nlb_tgs lookup (flattened first).
output "nlb-tg-count" {
  value = length(
    flatten(module.nlb_tgs.awscliout)
  )
}
@@ -0,0 +1,9 @@
# Version constraints for this module.
terraform {
  required_providers {
    # Any hashicorp/aws release from 4.36.1 upwards (no upper bound)
    aws = {
      version = ">= 4.36.1"
      source  = "hashicorp/aws"
    }
  }

  # "~> 1.3.0" admits only Terraform 1.3.x patch releases
  required_version = "~> 1.3.0"
}
@@ -0,0 +1,8 @@
# Inputs of the NLB monitoring module (all untyped, all required).

# Alarm name prefix supplied by the caller
variable "cw-alarm-prefix" {
}

# Passed to actions_enabled of the alarms
variable "actions-enabled" {
}

# Load balancer ARN; passed to "elbv2 describe-target-groups --load-balancer-arn"
# and split on "/" to build the LoadBalancer dimension
variable "load-balancer" {
}

# Per-alarm settings keyed by alarm name
variable "settings" {
}

# variable asrolearn {}

# Target-account credentials handed to the awscli helper module
variable "target-account-ak" {
}

variable "target-account-sk" {
}

variable "target-account-token" {
}
@@ -0,0 +1,27 @@
# Monitoring module
This module deploys the default cloudwatch metric monitoring
## Notes
The Terraform lifecycle configuration ignores tags to speed up subsequent Terraform updates. CloudWatch alarm tags cannot be viewed in the AWS console anyway.
## Example
```terraform
module "es-domains" {
source = "../../modules/util/resource-list"
resource-type = "opensearch"
}
module "es-monitoring" {
cw-alarm-prefix = local.cw-alarm-prefix
for_each = module.es-domains.result-set
source = "../../modules/ManagementGovernance/Monitoring.OpenSearch"
default-tags = local.default-tags
domain-name = each.value
threshold-CPUUtilization = 90
threshold-IndexingLatency = 3
threshold-SearchLatency = 3
# threshold-KibanaHealthyNodes = 1
actions-enabled = var.actions-enabled
sns-targets = var.sns-targets
}
```
@@ -0,0 +1,22 @@
# Identity of the credentials Terraform runs with; its id feeds the ClientId
# dimension below.
data "aws_caller_identity" "this" {
}

# One AWS/ES alarm per entry of var.settings, all scoped to the domain named by
# var.domain-name. Every entry supplies its own metric name, statistic, period,
# comparison, threshold and action target.
resource "aws_cloudwatch_metric_alarm" "ES-alarms" {
  for_each = var.settings

  # Identity
  alarm_name        = "${each.value.ecccode}-ES_${var.domain-name}-${each.value.metric}"
  alarm_description = "ES:${each.value.metric}"

  # Metric under observation
  namespace   = "AWS/ES"
  metric_name = each.value.metric
  statistic   = each.value.statistic
  period      = each.value.period
  dimensions = {
    # presumably the AWS account ID of the caller - confirm
    ClientId   = data.aws_caller_identity.this.id
    DomainName = var.domain-name
  }

  # Evaluation
  comparison_operator = each.value.comparison_operator
  threshold           = each.value.threshold
  evaluation_periods  = each.value.evaluation_periods

  # Notifications: the same action target is used for ALARM and OK transitions
  actions_enabled           = var.actions-enabled
  alarm_actions             = [each.value.action]
  ok_actions                = [each.value.action]
  insufficient_data_actions = []
}
@@ -0,0 +1,9 @@
# Toolchain requirements for this module.
terraform {
  # ">= 1.3.0" instead of "~> 1.3.0": the pessimistic operator on a three-part
  # version only admits 1.3.x patch releases and rejects every later 1.x release.
  required_version = ">= 1.3.0"

  required_providers {
    aws = {
      source  = "hashicorp/aws"
      version = ">= 4.36.1"
    }
  }
}
@@ -0,0 +1,4 @@
# Inputs of the OpenSearch/Elasticsearch monitoring module. All are required.

variable "cw-alarm-prefix" {
  description = "Alarm name prefix supplied by the caller. Currently not referenced by this module's alarm resource."
}

variable "actions-enabled" {
  description = "Whether the alarms execute their actions on state changes (passed to actions_enabled)."
}

variable "domain-name" {
  description = "Domain name; used as the DomainName alarm dimension and as part of the alarm name."
}

variable "settings" {
  description = "Map of alarm definitions, one alarm per entry. Each value must provide ecccode, metric, comparison_operator, evaluation_periods, period, statistic, threshold and action."
}
@@ -0,0 +1,31 @@
# Monitoring module
This module deploys the default cloudwatch metric monitoring
## Notes
The Terraform `lifecycle` configuration ignores tag changes to speed up subsequent Terraform runs. CloudWatch alarm tags cannot be viewed in the AWS console anyway.
AWS provider 4.47.0 or above is needed for datasource aws_db_instances (https://github.com/hashicorp/terraform-provider-aws/blob/main/CHANGELOG.md)
## Example
```terraform
module "rds-instances" {
source = "../../modules/util/resource-list"
resource-type = "rds"
}
module "rds-monitoring" {
# for_each = toset(var.rds-instance-ids)
cw-alarm-prefix = local.cw-alarm-prefix
for_each = module.rds-instances.result-set
source = "../../modules/ManagementGovernance/Monitoring.RDS"
default-tags = local.default-tags
rds-instance-name = each.value
threshold-CpuUtilization = 90
threshold-FreeableMemory = 512 * 1024 * 1024
threshold-FreeStorageSpace = 5 * 1024 * 1024 * 1024
threshold-DiskQueueDepth = 30
threshold-ReadLatency = 0.03
threshold-WriteLatency = 0.03
actions-enabled = var.actions-enabled
sns-targets = var.sns-targets
}
```
@@ -0,0 +1,19 @@
# One CloudWatch alarm per entry of var.settings for a single RDS instance
# (AWS/RDS namespace).
resource "aws_cloudwatch_metric_alarm" "rds-alarms" {
  for_each = var.settings

  # Naming
  alarm_name        = "${each.value.ecccode}-RDS_${var.rds-instance-name}-${each.value.metric}"
  alarm_description = "RDS:${each.value.metric}"

  # Metric selection
  namespace   = "AWS/RDS"
  metric_name = each.value.metric
  statistic   = each.value.statistic
  period      = each.value.period
  dimensions = {
    DBInstanceIdentifier = var.rds-instance-name
  }

  # Evaluation
  comparison_operator = each.value.comparison_operator
  threshold           = each.value.threshold
  evaluation_periods  = each.value.evaluation_periods

  # Notifications: the same target is used for ALARM and OK transitions;
  # INSUFFICIENT_DATA triggers nothing.
  actions_enabled           = var.actions-enabled
  alarm_actions             = [each.value.action]
  ok_actions                = [each.value.action]
  insufficient_data_actions = []
}
@@ -0,0 +1,9 @@
# Toolchain requirements for this module.
terraform {
  # ">= 1.3.0" instead of "~> 1.3.0": the pessimistic operator on a three-part
  # version only admits 1.3.x patch releases and rejects every later 1.x release.
  required_version = ">= 1.3.0"

  required_providers {
    aws = {
      source  = "hashicorp/aws"
      version = ">= 4.47.0"
    }
  }
}
@@ -0,0 +1,4 @@
# Inputs of the RDS monitoring module. All are required.

variable "cw-alarm-prefix" {
  description = "Alarm name prefix supplied by the caller. Currently not referenced by this module's alarm resource."
}

variable "actions-enabled" {
  description = "Whether the alarms execute their actions on state changes (passed to actions_enabled)."
}

variable "rds-instance-name" {
  description = "RDS instance identifier; used as the DBInstanceIdentifier alarm dimension and as part of the alarm name."
}

variable "settings" {
  description = "Map of alarm definitions, one alarm per entry. Each value must provide ecccode, metric, comparison_operator, evaluation_periods, period, statistic, threshold and action."
}
@@ -0,0 +1,26 @@
# Monitoring module
This module deploys the default cloudwatch metric monitoring
## Notes
The Terraform `lifecycle` configuration ignores tag changes to speed up subsequent Terraform runs. CloudWatch alarm tags cannot be viewed in the AWS console anyway.
## Example
```terraform
module "redis-instances" {
source = "../../modules/util/resource-list"
resource-type = "redis"
}
module "redis-monitoring" {
cw-alarm-prefix = local.cw-alarm-prefix
for_each = module.redis-instances.result-set
source = "../../modules/ManagementGovernance/Monitoring.Redis"
default-tags = local.default-tags
redis-cluster-id = each.value
threshold-EngineCPUUtilization = 90
threshold-DatabaseMemoryUsagePercentage = 90
threshold-CacheHitRate = 3
actions-enabled = var.actions-enabled
sns-targets = var.sns-targets
}
```
@@ -0,0 +1,21 @@
# One CloudWatch alarm per entry of var.settings for a single cache cluster
# (AWS/ElastiCache namespace).
resource "aws_cloudwatch_metric_alarm" "redis-alarms" {
  for_each = var.settings

  # Naming
  alarm_name        = "${each.value.ecccode}-Redis_${var.redis-cluster-id}-${each.value.metric}"
  alarm_description = "ElastiCache:${each.value.metric}"

  # Metric selection
  namespace   = "AWS/ElastiCache"
  metric_name = each.value.metric
  statistic   = each.value.statistic
  period      = each.value.period
  dimensions = {
    CacheClusterId = var.redis-cluster-id
  }

  # Evaluation; periods without datapoints are treated as within threshold.
  comparison_operator = each.value.comparison_operator
  threshold           = each.value.threshold
  evaluation_periods  = each.value.evaluation_periods
  treat_missing_data  = "notBreaching"

  # Notifications: the same target is used for ALARM and OK transitions;
  # INSUFFICIENT_DATA triggers nothing.
  actions_enabled           = var.actions-enabled
  alarm_actions             = [each.value.action]
  ok_actions                = [each.value.action]
  insufficient_data_actions = []
}
@@ -0,0 +1,9 @@
# Toolchain requirements for this module.
terraform {
  # ">= 1.3.0" instead of "~> 1.3.0": the pessimistic operator on a three-part
  # version only admits 1.3.x patch releases and rejects every later 1.x release.
  required_version = ">= 1.3.0"

  required_providers {
    aws = {
      source  = "hashicorp/aws"
      version = ">= 4.36.1"
    }
  }
}
@@ -0,0 +1,4 @@
# Inputs of the Redis (ElastiCache) monitoring module. All are required.

variable "cw-alarm-prefix" {
  description = "Alarm name prefix supplied by the caller. Currently not referenced by this module's alarm resource."
}

variable "actions-enabled" {
  description = "Whether the alarms execute their actions on state changes (passed to actions_enabled)."
}

variable "redis-cluster-id" {
  description = "Cache cluster id; used as the CacheClusterId alarm dimension and as part of the alarm name."
}

variable "settings" {
  description = "Map of alarm definitions, one alarm per entry. Each value must provide ecccode, metric, comparison_operator, evaluation_periods, period, statistic, threshold and action."
}
@@ -0,0 +1,24 @@
# Monitoring module
This module deploys the default cloudwatch metric monitoring
## Notes
The Terraform `lifecycle` configuration ignores tag changes to speed up subsequent Terraform runs. CloudWatch alarm tags cannot be viewed in the AWS console anyway.
## Example
```terraform
module "tgw" {
source = "../../modules/util/resource-list"
resource-type = "tgw"
}
module "tgw-monitoring" {
cw-alarm-prefix = local.cw-alarm-prefix
for_each = module.tgw.result-set
source = "../../modules/ManagementGovernance/Monitoring.TGW"
default-tags = local.default-tags
tgw-id = split("/", each.value)[1]
threshold-PacketDropCountNoRoute = 1
actions-enabled = var.actions-enabled
sns-targets = var.sns-targets
}
```
@@ -0,0 +1,19 @@
# One CloudWatch alarm per entry of var.settings for a single transit gateway
# (AWS/TransitGateway namespace).
#
# The resource label is kept as-is so existing state addresses do not change.
resource "aws_cloudwatch_metric_alarm" "tgw-PacketDropCountNoRoute" {
  for_each = var.settings

  # The alarm name carries the entry's metric instead of a hard-coded
  # "PacketDropCountNoRoute": the resource iterates over var.settings, so a fixed
  # suffix gives every entry with the same ecccode the same alarm name. For the
  # PacketDropCountNoRoute metric the resulting name is unchanged.
  alarm_name          = "${each.value["ecccode"]}-TGW_${var.tgw-id}-${each.value["metric"]}"
  alarm_description   = "TGW:${each.value["metric"]}"
  namespace           = "AWS/TransitGateway"
  metric_name         = each.value["metric"]
  statistic           = each.value["statistic"]
  period              = each.value["period"]
  comparison_operator = each.value["comparison_operator"]
  threshold           = each.value["threshold"]
  evaluation_periods  = each.value["evaluation_periods"]

  # The same target is used for ALARM and OK transitions; INSUFFICIENT_DATA
  # triggers nothing.
  actions_enabled           = var.actions-enabled
  alarm_actions             = [each.value["action"]]
  ok_actions                = [each.value["action"]]
  insufficient_data_actions = []

  dimensions = {
    TransitGateway = var.tgw-id
  }
}
@@ -0,0 +1,9 @@
# Toolchain requirements for this module.
terraform {
  # ">= 1.3.0" instead of "~> 1.3.0": the pessimistic operator on a three-part
  # version only admits 1.3.x patch releases and rejects every later 1.x release.
  required_version = ">= 1.3.0"

  required_providers {
    aws = {
      source  = "hashicorp/aws"
      version = ">= 4.47.0"
    }
  }
}
@@ -0,0 +1,4 @@
# Inputs of the Transit Gateway monitoring module. All are required.

variable "cw-alarm-prefix" {
  description = "Alarm name prefix supplied by the caller. Currently not referenced by this module's alarm resource."
}

variable "actions-enabled" {
  description = "Whether the alarms execute their actions on state changes (passed to actions_enabled)."
}

variable "tgw-id" {
  description = "Transit gateway id; used as the TransitGateway alarm dimension and as part of the alarm name."
}

variable "settings" {
  description = "Map of alarm definitions, one alarm per entry. Each value must provide ecccode, metric, comparison_operator, evaluation_periods, period, statistic, threshold and action."
}
@@ -0,0 +1,47 @@
<!-- This readme file is generated with terraform-docs -->
## Requirements
| Name | Version |
|------|---------|
| terraform | >= 1.3.0 |
| aws | >= 5.0 |
## Providers
| Name | Version |
|------|---------|
| aws | >= 5.0 |
## Modules
No modules.
## Resources
| Name | Type |
|------|------|
| [aws_sns_topic.this](https://registry.terraform.io/providers/hashicorp/aws/latest/docs/resources/sns_topic) | resource |
| [aws_sns_topic_subscription.this](https://registry.terraform.io/providers/hashicorp/aws/latest/docs/resources/sns_topic_subscription) | resource |
| [aws_caller_identity.this](https://registry.terraform.io/providers/hashicorp/aws/latest/docs/data-sources/caller_identity) | data source |
| [aws_region.this](https://registry.terraform.io/providers/hashicorp/aws/latest/docs/data-sources/region) | data source |
## Inputs
| Name | Description | Type | Default | Required |
|------|-------------|------|---------|:--------:|
| email-addresses | Email recipients of SNS notifications | `set(string)` | n/a | yes |
| kms-key-id | KMS key id for SNS topic at-rest encryption. Make sure the sender has access to this key | `string` | n/a | yes |
| sender | ARN of SNS sender or sending service name | `string` | n/a | yes |
| sender-type | Sender principal type. Value should be either *AWS* or *Service* | `string` | n/a | yes |
| sns-topic-description | SNS topic display name | `string` | n/a | yes |
| sns-topic-name | Name of SNS topic | `string` | n/a | yes |
## Outputs
| Name | Description |
|------|-------------|
| sns-topic-arn | n/a |
---
## Authorship
This module was developed by xpk.
@@ -0,0 +1,69 @@
# Account id of the caller; used as the admin principal, in the SourceOwner
# condition and in the topic ARN of the topic policy.
data "aws_caller_identity" "this" {}
# Current provider region; used to build the topic ARN in the topic policy.
data "aws_region" "this" {}
locals {
  # ARN string of the topic, assembled from region, account id and topic name.
  # It is the Resource of every statement in the topic policy.
  sns_topic_arn = "arn:aws:sns:${data.aws_region.this.name}:${data.aws_caller_identity.this.account_id}:${var.sns-topic-name}"
}

# KMS-encrypted SNS topic with an inline access policy made of three statements:
#   1. SnsTopicAdmin              - management actions for the owning account
#   2. AllowPublishing            - sns:Publish for the configured sender principal
#   3. AllowPublishThroughSSLOnly - deny publishing over non-TLS transport
resource "aws_sns_topic" "this" {
  name              = var.sns-topic-name
  display_name      = var.sns-topic-description
  kms_master_key_id = var.kms-key-id

  policy = jsonencode({
    Version = "2008-10-17"
    Id      = "SnsTopicPolicy"
    Statement = [
      {
        Sid    = "SnsTopicAdmin"
        Effect = "Allow"
        Principal = {
          AWS = data.aws_caller_identity.this.account_id
        }
        Action = [
          "SNS:GetTopicAttributes",
          "SNS:SetTopicAttributes",
          "SNS:AddPermission",
          "SNS:RemovePermission",
          "SNS:DeleteTopic",
          "SNS:Subscribe",
          "SNS:ListSubscriptionsByTopic",
          "SNS:Publish",
          "SNS:Receive",
        ]
        Resource = local.sns_topic_arn
        Condition = {
          StringEquals = {
            "AWS:SourceOwner" = data.aws_caller_identity.this.account_id
          }
        }
      },
      {
        Sid    = "AllowPublishing"
        Effect = "Allow"
        Principal = {
          # Principal type key ("AWS" or "Service") is chosen by the caller.
          (var.sender-type) = var.sender
        }
        Action   = "sns:Publish"
        Resource = local.sns_topic_arn
      },
      {
        Sid       = "AllowPublishThroughSSLOnly"
        Effect    = "Deny"
        Principal = "*"
        Action    = "SNS:Publish"
        Resource  = local.sns_topic_arn
        Condition = {
          Bool = {
            "aws:SecureTransport" = "false"
          }
        }
      },
    ]
  })
}
# One email subscription to the topic per address in var.email-addresses.
resource "aws_sns_topic_subscription" "this" {
  for_each = var.email-addresses

  protocol  = "email"
  endpoint  = each.value
  topic_arn = aws_sns_topic.this.arn
}
@@ -0,0 +1,3 @@
# Exposes the topic ARN so callers can wire it into alarms, event targets, etc.
output "sns-topic-arn" {
  description = "ARN of the SNS topic created by this module"
  value       = aws_sns_topic.this.arn
}
@@ -0,0 +1,33 @@
# Inputs of the SNS topic module. All are required (no defaults).

# Value of the publishing principal in the AllowPublishing policy statement.
variable "sender" {
  description = "ARN of SNS sender or sending service name"
  type        = string
}

# Key of the publishing principal in the AllowPublishing policy statement.
variable "sender-type" {
  description = "Sender principal type. Value should be either *AWS* or *Service*"
  type        = string

  validation {
    condition     = var.sender-type == "AWS" || var.sender-type == "Service"
    error_message = "Valid values are AWS or Service"
  }
}

# Topic name; also embedded in the topic ARN used by the topic policy.
variable "sns-topic-name" {
  description = "Name of SNS topic"
  type        = string
}

# Assigned to the topic's display_name.
variable "sns-topic-description" {
  description = "SNS topic display name"
  type        = string
}

# Assigned to the topic's kms_master_key_id.
variable "kms-key-id" {
  description = "KMS key id for SNS topic at-rest encryption. Make sure the sender has access to this key"
  type        = string
}

# One email subscription is created per element.
variable "email-addresses" {
  description = "Email recipients of SNS notifications"
  type        = set(string)
}
@@ -0,0 +1,9 @@
# Toolchain requirements for this module: Terraform 1.3.0 or newer and the
# hashicorp/aws provider 5.0 or newer.
terraform {
  required_version = ">= 1.3.0"

  required_providers {
    aws = {
      source  = "hashicorp/aws"
      version = ">= 5.0"
    }
  }
}
@@ -0,0 +1,101 @@
ACM sends daily expiration events for all active certificates (public, private and imported) starting 45 days prior to expiration [1].
This module sets up an EventBridge event rule and an SNS topic that deliver email notifications for expiring certificates, which is especially useful for imported certificates.
## Notes
* DaysBeforeExpiry cannot be greater than 45
```bash
aws acm put-account-configuration --idempotency-token abcd123456 --expiry-events DaysBeforeExpiry=46 --region=ap-east-1
An error occurred (ValidationException) when calling the PutAccountConfiguration operation: Days before expiry cannot be over 45.
```
* KMS key for SNS must allow events.amazonaws.com. Check that this statement is present in the KMS key policy. Otherwise you will get FailedInvocation in event rule graph and there is no other debug info. The default alias/aws/sns managed key does not allow encryption / decryption from cloudwatch or events [2].
```json
{
"Sid": "Allow publish from events",
"Effect": "Allow",
"Principal": {
"Service": "events.amazonaws.com"
},
"Action": [
"kms:Encrypt",
"kms:Decrypt",
"kms:ReEncrypt*",
"kms:GenerateDataKey*",
"kms:DescribeKey"
],
"Resource": "*"
}
```
[1] https://docs.aws.amazon.com/acm/latest/userguide/supported-events.html
[2] https://docs.gruntwork.io/discussions/knowledge-base/238/
## Sample EventBridge event
```json
{
"version": "0",
"id": "id",
"detail-type": "ACM Certificate Approaching Expiration",
"source": "aws.acm",
"account": "account",
"time": "2020-09-30T06:51:08Z",
"region": "region",
"resources": [
"arn:aws:acm:region:account:certificate/certificate_ID"
],
"detail": {
"DaysToExpiry": 31,
"CommonName": "example.com"
}
}
```
## Requirements
| Name | Version |
|------|---------|
| terraform | >= 1.3.0 |
| aws | >= 5.0 |
## Providers
| Name | Version |
|------|---------|
| aws | >= 5.0 |
| random | n/a |
## Modules
| Name | Source | Version |
|------|--------|---------|
| awscli | ../../util/terraform-aws-cli | n/a |
## Resources
| Name | Type |
|------|------|
| [aws_cloudwatch_event_rule.this](https://registry.terraform.io/providers/hashicorp/aws/latest/docs/resources/cloudwatch_event_rule) | resource |
| [aws_cloudwatch_event_target.sns](https://registry.terraform.io/providers/hashicorp/aws/latest/docs/resources/cloudwatch_event_target) | resource |
| [aws_sns_topic.ssl-cert-expiry-notice](https://registry.terraform.io/providers/hashicorp/aws/latest/docs/resources/sns_topic) | resource |
| [aws_sns_topic_policy.default](https://registry.terraform.io/providers/hashicorp/aws/latest/docs/resources/sns_topic_policy) | resource |
| [aws_sns_topic_subscription.ssl-cert-expiry-notice-sub](https://registry.terraform.io/providers/hashicorp/aws/latest/docs/resources/sns_topic_subscription) | resource |
| [random_id.this](https://registry.terraform.io/providers/hashicorp/random/latest/docs/resources/id) | resource |
| [aws_caller_identity.this](https://registry.terraform.io/providers/hashicorp/aws/latest/docs/data-sources/caller_identity) | data source |
| [aws_iam_policy_document.sns_topic_policy](https://registry.terraform.io/providers/hashicorp/aws/latest/docs/data-sources/iam_policy_document) | data source |
## Inputs
| Name | Description | Type | Default | Required |
|------|----------------------------------------------------------------------------------|------|---------|:--------:|
| days-before-expiry | ACM DaysBeforeExpiry account configuration | `number` | `45` | no |
| email-addresses | Set of email addresses to receive SNS notifications | `set(string)` | n/a | yes |
| res-prefix | Resource name prefix | `string` | `"aws"` | no |
| sns-kms-key-arn | ARN of KMS key used for SNS encryption. This key must allow events.amazonaws.com | `string` | `null` | no |
## Outputs
No outputs.
---
## Authorship
This module was developed by xpk.
@@ -0,0 +1,98 @@
# Caller identity lookup.
# NOTE(review): no other block in this file references this data source -
# confirm whether it is still needed.
data "aws_caller_identity" "this" {}
# Random 2-byte value. Its decimal form is appended to the event rule, event
# target and SNS topic names, and is passed as the --idempotency-token of the
# ACM account-configuration call.
resource "random_id" "this" {
  byte_length = 2
}
# Event rule matching "ACM Certificate Approaching Expiration" events from
# source aws.acm.
resource "aws_cloudwatch_event_rule" "this" {
  name        = "${var.res-prefix}-ssl-cert-expiry-${random_id.this.dec}"
  description = "Reminder of SSL expiring certificates"

  event_pattern = jsonencode({
    source        = ["aws.acm"]
    "detail-type" = ["ACM Certificate Approaching Expiration"]
  })
}
# Sends matched events to the SNS topic, rewritten into a short text message.
resource "aws_cloudwatch_event_target" "sns" {
  rule      = aws_cloudwatch_event_rule.this.name
  arn       = aws_sns_topic.ssl-cert-expiry-notice.arn
  target_id = "ssl-cert-expiry-sns-${random_id.this.dec}"

  input_transformer {
    # Fields extracted from the event: first resource ARN, days left, common name.
    input_paths = {
      cert = "$.resources[0]"
      days = "$.detail.DaysToExpiry"
      cn   = "$.detail.CommonName"
    }

    # One quoted string per output line; <name> placeholders refer to input_paths.
    input_template = <<-EOT
      "The following ACM certificate will expire soon"
      "ID: <cert>"
      "CommonName: <cn>"
      "Days to expiry: <days>"
    EOT
  }
}
# Modify ACM DaysBeforeExpiry account setting if it should be set lower than the default 45 days
# Runs "aws acm put-account-configuration" through the terraform-aws-cli helper
# module, but only when the requested DaysBeforeExpiry is below 45.
module "awscli" {
  source = "../../util/terraform-aws-cli"
  count  = var.days-before-expiry < 45 ? 1 : 0

  role_session_name = "terraform-awscli"
  aws_cli_commands = [
    "acm",
    "put-account-configuration",
    "--idempotency-token",
    random_id.this.dec,
    "--expiry-events DaysBeforeExpiry=${var.days-before-expiry}",
  ]
}
# SNS topic and subscription
# Topic that receives the expiry notices. It is encrypted with
# var.sns-kms-key-arn when that variable is set (it defaults to null).
resource "aws_sns_topic" "ssl-cert-expiry-notice" {
  kms_master_key_id = var.sns-kms-key-arn
  name              = "${var.res-prefix}-ssl-cert-expiry-notice-${random_id.this.dec}"
}
# Attaches the policy document rendered by
# data.aws_iam_policy_document.sns_topic_policy to the expiry-notice topic.
resource "aws_sns_topic_policy" "default" {
  policy = data.aws_iam_policy_document.sns_topic_policy.json
  arn    = aws_sns_topic.ssl-cert-expiry-notice.arn
}
# Topic policy with two statements:
#   - allow events.amazonaws.com to publish to the topic
#   - deny publishing when the request is not made over secure transport
data "aws_iam_policy_document" "sns_topic_policy" {
  statement {
    sid       = "AllowPublishingFromEvents"
    effect    = "Allow"
    resources = [aws_sns_topic.ssl-cert-expiry-notice.arn]
    actions = [
      "sns:Publish",
      "SNS:Publish",
    ]

    principals {
      type        = "Service"
      identifiers = ["events.amazonaws.com"]
    }
  }

  statement {
    sid       = "AllowPublishThroughSSLOnly"
    effect    = "Deny"
    resources = [aws_sns_topic.ssl-cert-expiry-notice.arn]
    actions = [
      "sns:Publish",
      "SNS:Publish",
    ]

    principals {
      type        = "AWS"
      identifiers = ["*"]
    }

    condition {
      test     = "Bool"
      variable = "aws:SecureTransport"
      values   = ["false"]
    }
  }
}
# One email subscription to the expiry-notice topic per address in
# var.email-addresses.
resource "aws_sns_topic_subscription" "ssl-cert-expiry-notice-sub" {
  for_each = var.email-addresses

  protocol  = "email"
  endpoint  = each.value
  topic_arn = aws_sns_topic.ssl-cert-expiry-notice.arn
}
@@ -0,0 +1,22 @@
# Inputs of the ACM certificate expiry notification module.

# Required. One email subscription is created per element.
variable "email-addresses" {
  description = "Set of email addresses to receive SNS notifications"
  type        = set(string)
}

# Optional. The account setting is only modified when this is below 45.
variable "days-before-expiry" {
  description = "ACM DaysBeforeExpiry account configuration"
  type        = number
  default     = 45
}

# Optional. Prepended to the event rule and SNS topic names.
variable "res-prefix" {
  description = "Resource name prefix"
  type        = string
  default     = "aws"
}

# Optional. Assigned to the topic's kms_master_key_id; null by default.
variable "sns-kms-key-arn" {
  description = "ARN of KMS key used for SNS encryption. This key must allow events.amazonaws.com"
  type        = string
  default     = null
}
@@ -0,0 +1,9 @@
# Toolchain requirements for this module.
terraform {
  required_version = ">= 1.3.0"

  required_providers {
    aws = {
      source  = "hashicorp/aws"
      version = ">= 5.0"
    }
    # The module creates a random_id resource, so the random provider is declared
    # explicitly instead of relying on Terraform's implicit hashicorp/<name>
    # lookup. No version constraint is added, to stay compatible with existing
    # lock files.
    random = {
      source = "hashicorp/random"
    }
  }
}
@@ -0,0 +1,44 @@
<!-- This readme file is generated with terraform-docs -->
## Requirements
| Name | Version |
|------|---------|
| terraform | >= 1.3.0 |
| aws | >= 5.0 |
## Providers
| Name | Version |
|------|---------|
| aws | >= 5.0 |
## Modules
No modules.
## Resources
| Name | Type |
|------|------|
| [aws_cloudwatch_log_group.this](https://registry.terraform.io/providers/hashicorp/aws/latest/docs/resources/cloudwatch_log_group) | resource |
| [aws_ssm_maintenance_window.this](https://registry.terraform.io/providers/hashicorp/aws/latest/docs/resources/ssm_maintenance_window) | resource |
| [aws_ssm_maintenance_window_target.this](https://registry.terraform.io/providers/hashicorp/aws/latest/docs/resources/ssm_maintenance_window_target) | resource |
| [aws_ssm_maintenance_window_task.this](https://registry.terraform.io/providers/hashicorp/aws/latest/docs/resources/ssm_maintenance_window_task) | resource |
## Inputs
| Name | Description | Type | Default | Required |
|------|-------------|------|---------|:--------:|
| cloudwatch-log-retention-days | Days to retain logs on cloudwatch logs | `number` | `30` | no |
| cron-expression | Cron expression for SSM maintenance window schedule | `string` | n/a | yes |
| description | Description of command to run | `string` | n/a | yes |
| instance-id | Id of Ec2 instance to execute the script | `string` | n/a | yes |
| maintenance-window-duration | Duration of maintenance window, must be >= 2 | `number` | `2` | no |
| schedule-name | Name of maintenance window. e.g. Daily0900UTC8 | `string` | n/a | yes |
| shell-script-path | Full path to script | `string` | n/a | yes |
## Outputs
No outputs.
---
## Authorship
This module was developed by xpk.
@@ -0,0 +1,80 @@
# SSM run command
#resource "aws_ssm_document" "this" {
# name = replace(title(var.description), " ", "")
# document_type = "Command"
# target_type = "/AWS::EC2::Instance"
# content = jsonencode(
# {
# "schemaVersion" : "2.2",
# "description" : "Run script for ${var.description}",
# "parameters" : {
# },
# "mainSteps" : [
# {
# "action" : "aws:runShellScript",
# "name" : "RunShellScript",
# "inputs" : {
# "runCommand" : var.shell-script-path
# }
# }
# ]
# }
# )
#}
# Maintenance window that schedules the script run.
# The name is var.description in title case with spaces removed.
# NOTE(review): var.schedule-name is described as the maintenance window name but
# is not used here - confirm which of the two is intended.
resource "aws_ssm_maintenance_window" "this" {
  name        = replace(title(var.description), " ", "")
  description = var.description

  schedule = var.cron-expression
  duration = var.maintenance-window-duration
  cutoff   = 1
}
# Registers the single EC2 instance (var.instance-id) as a target of the window.
resource "aws_ssm_maintenance_window_target" "this" {
  window_id     = aws_ssm_maintenance_window.this.id
  resource_type = "INSTANCE"

  name        = replace(title(var.description), " ", "")
  description = var.description

  targets {
    key    = "InstanceIds"
    values = [var.instance-id]
  }
}
# Run Command task of the window: executes var.shell-script-path through the
# AWS-RunShellScript document on var.instance-id and sends the command output to
# the module's CloudWatch log group.
resource "aws_ssm_maintenance_window_task" "this" {
  window_id = aws_ssm_maintenance_window.this.id
  name      = replace(title(var.description), " ", "")

  task_type = "RUN_COMMAND"
  task_arn  = "AWS-RunShellScript"
  priority  = 1

  max_concurrency = 1
  max_errors      = 1

  targets {
    key    = "InstanceIds"
    values = [var.instance-id]
  }

  task_invocation_parameters {
    run_command_parameters {
      timeout_seconds = 60 # If this time is reached and the command has not already started executing, it doesn't run.

      parameter {
        name   = "commands"
        values = [var.shell-script-path]
      }

      cloudwatch_config {
        cloudwatch_output_enabled = true
        cloudwatch_log_group_name = aws_cloudwatch_log_group.this.name
      }
    }
  }
}
# Log group that receives the Run Command output of the maintenance window task.
# It is named after var.description (title case, spaces removed).
resource "aws_cloudwatch_log_group" "this" {
  name              = "/aws/ssm-maintenance/${replace(title(var.description), " ", "")}"
  log_group_class   = "STANDARD" # infrequent access logs can only be viewed via insight
  retention_in_days = var.cloudwatch-log-retention-days
}
@@ -0,0 +1,36 @@
# Required inputs of the SSM scheduled-script module.

# Passed as the "commands" parameter of the AWS-RunShellScript task.
variable "shell-script-path" {
  description = "Full path to script"
  type        = string
}

# Assigned to the maintenance window's schedule.
variable "cron-expression" {
  description = "Cron expression for SSM maintenance window schedule"
  type        = string
}

# Target of both the maintenance window target and the task.
variable "instance-id" {
  description = "Id of Ec2 instance to execute the script"
  type        = string
}

# Also the basis of the resource names (title case, spaces removed).
variable "description" {
  description = "Description of command to run"
  type        = string
}

# NOTE(review): required, but the module's resources derive their names from
# var.description and do not reference this variable - confirm it is still needed.
variable "schedule-name" {
  description = "Name of maintenance window. e.g. Daily0900UTC8"
  type        = string
}
# Assigned to the maintenance window's duration. The window's cutoff is fixed at
# 1, which is why the documented minimum is 2; the validation below enforces that
# minimum at plan time.
variable "maintenance-window-duration" {
  type        = number
  description = "Duration of maintenance window, must be >= 2"
  default     = 2

  validation {
    condition     = var.maintenance-window-duration >= 2
    error_message = "The maintenance-window-duration value must be greater than or equal to 2."
  }
}
# Optional. Assigned to retention_in_days of the module's CloudWatch log group.
variable "cloudwatch-log-retention-days" {
  description = "Days to retain logs on cloudwatch logs"
  type        = number
  default     = 30
}
@@ -0,0 +1,9 @@
# Toolchain requirements for this module.
terraform {
  required_version = ">= 1.3.0"

  required_providers {
    aws = {
      source = "hashicorp/aws"
      # The module sets log_group_class on aws_cloudwatch_log_group, which early
      # 5.x provider releases do not accept. 5.27.0 is believed to be the first
      # release with that argument - verify against the provider changelog.
      version = ">= 5.27.0"
    }
  }
}