initial commit
This commit is contained in:
+17
@@ -0,0 +1,17 @@
|
||||
*.tfstate.backup
|
||||
*.backup
|
||||
*.tfstate
|
||||
*.tfstate.lock
|
||||
**/*.tfstate
|
||||
**/*.backup
|
||||
.terraform/
|
||||
.DS_Store
|
||||
*.iml
|
||||
.idea
|
||||
.terraform.lock.hcl
|
||||
*.log
|
||||
examples/
|
||||
experimental/
|
||||
headdesk-aws/
|
||||
vsphere-yige/
|
||||
anz-sandbox/
|
||||
@@ -0,0 +1,16 @@
|
||||
def lambda_handler(event, context):
    """Echo the query-string parameters of the incoming event.

    Args:
        event: Mapping describing the request. Its optional
            'queryStringParameters' entry holds the query parameters.
        context: Lambda context object; not used.

    Returns:
        A dict with 'statusCode' 200 and a 'body' string that contains the
        received parameters.
    """
    # The key may be absent or explicitly None when the request carries no
    # query string; normalise both cases to an empty dict so the response
    # body never reports "None".
    params = event.get('queryStringParameters') or {}

    # Print all query parameters
    print("Received query parameters:", params)

    # Print the specific parameter this sample cares about, 'inputValue'
    if 'inputValue' in params:
        print("Value of 'inputValue':", params['inputValue'])

    # Return the input parameters as the response
    return {
        'statusCode': 200,
        'body': f"Received parameters: {params}"
    }
|
||||
@@ -0,0 +1,74 @@
|
||||
<!-- This readme file is generated with terraform-docs -->
|
||||
# ApigwAuthSample
|
||||
A working example which deploys HTTP api, Lambda functions, and necessary permissions.
|
||||
|
||||
## Testing the API
|
||||
To test this in postman, put in the following settings:
|
||||
|
||||
URL: https://<api-id>.execute-api.ap-east-1.amazonaws.com/?inputValue=TestMessage123
|
||||
Authorization: api key, key = Authorization, value = sha256 hash, add to = Header
|
||||
|
||||
## Requirements
|
||||
|
||||
| Name | Version |
|
||||
|------|---------|
|
||||
| terraform | ~> 1.13.0 |
|
||||
| aws | ~> 5.0 |
|
||||
|
||||
## Providers
|
||||
|
||||
| Name | Version |
|
||||
|------|---------|
|
||||
| archive | 2.7.1 |
|
||||
| aws | 5.100.0 |
|
||||
| random | 3.7.2 |
|
||||
|
||||
## Modules
|
||||
|
||||
No modules.
|
||||
|
||||
## Resources
|
||||
|
||||
| Name | Type |
|
||||
|------|------|
|
||||
| [aws_apigatewayv2_api.SampleHttpApi](https://registry.terraform.io/providers/hashicorp/aws/latest/docs/resources/apigatewayv2_api) | resource |
|
||||
| [aws_apigatewayv2_deployment.deployment](https://registry.terraform.io/providers/hashicorp/aws/latest/docs/resources/apigatewayv2_deployment) | resource |
|
||||
| [aws_apigatewayv2_stage.stage1](https://registry.terraform.io/providers/hashicorp/aws/latest/docs/resources/apigatewayv2_stage) | resource |
|
||||
| [aws_cloudwatch_log_group.api_logging](https://registry.terraform.io/providers/hashicorp/aws/latest/docs/resources/cloudwatch_log_group) | resource |
|
||||
| [aws_cloudwatch_log_group.loggroups](https://registry.terraform.io/providers/hashicorp/aws/latest/docs/resources/cloudwatch_log_group) | resource |
|
||||
| [aws_iam_role.role](https://registry.terraform.io/providers/hashicorp/aws/latest/docs/resources/iam_role) | resource |
|
||||
| [aws_iam_role_policy_attachment.role](https://registry.terraform.io/providers/hashicorp/aws/latest/docs/resources/iam_role_policy_attachment) | resource |
|
||||
| [aws_lambda_function.EchoFunction](https://registry.terraform.io/providers/hashicorp/aws/latest/docs/resources/lambda_function) | resource |
|
||||
| [aws_lambda_function.SampleAuthorizer](https://registry.terraform.io/providers/hashicorp/aws/latest/docs/resources/lambda_function) | resource |
|
||||
| [aws_lambda_permission.EchoFunction](https://registry.terraform.io/providers/hashicorp/aws/latest/docs/resources/lambda_permission) | resource |
|
||||
| [aws_lambda_permission.SampleAuthorizer](https://registry.terraform.io/providers/hashicorp/aws/latest/docs/resources/lambda_permission) | resource |
|
||||
| [random_password.pw](https://registry.terraform.io/providers/hashicorp/random/latest/docs/resources/password) | resource |
|
||||
| [archive_file.EchoFunction](https://registry.terraform.io/providers/hashicorp/archive/latest/docs/data-sources/file) | data source |
|
||||
| [archive_file.SampleAuthorizer](https://registry.terraform.io/providers/hashicorp/archive/latest/docs/data-sources/file) | data source |
|
||||
| [aws_caller_identity.this](https://registry.terraform.io/providers/hashicorp/aws/latest/docs/data-sources/caller_identity) | data source |
|
||||
| [aws_iam_policy_document.lambda_role](https://registry.terraform.io/providers/hashicorp/aws/latest/docs/data-sources/iam_policy_document) | data source |
|
||||
|
||||
## Inputs
|
||||
|
||||
| Name | Description | Type | Default | Required |
|
||||
|------|-------------|------|---------|:--------:|
|
||||
| DynamicAddressGroup | n/a | `any` | n/a | yes |
|
||||
| application | n/a | `any` | n/a | yes |
|
||||
| aws-region | n/a | `any` | n/a | yes |
|
||||
| costcenter | n/a | `any` | n/a | yes |
|
||||
| customer-name | n/a | `any` | n/a | yes |
|
||||
| environment | n/a | `any` | n/a | yes |
|
||||
| owner | n/a | `any` | n/a | yes |
|
||||
| project | n/a | `any` | n/a | yes |
|
||||
|
||||
## Outputs
|
||||
|
||||
| Name | Description |
|
||||
|------|-------------|
|
||||
| api\_deployment\_id | n/a |
|
||||
| api\_endpoint | n/a |
|
||||
| last-updated | n/a |
|
||||
|
||||
---
|
||||
## Authorship
|
||||
This module was developed by Rackspace.
|
||||
@@ -0,0 +1,55 @@
|
||||
import hashlib
|
||||
import os
|
||||
|
||||
|
||||
#region = os.environ['region']
|
||||
#account_id = os.environ['account_id']
|
||||
#api_id = os.environ['api_id']
|
||||
pw_hash = os.environ['pw_hash']
|
||||
#resource_arn = f"arn:aws:execute-api:{region}:{account_id}:{api_id}:/*/*/" # based on observed routeArn in event
|
||||
|
||||
def lambda_handler(event, context):
    """Authorize a request by comparing its authorization header to pw_hash.

    Args:
        event: Mapping describing the request. The token is read from
            event['headers']['authorization'].
        context: Lambda context object; not used.

    Returns:
        A dict of the form {"isAuthorized": bool} (simple authorizer
        response).
    """
    # Function-scope import keeps this block self-contained.
    import hmac

    # debug
    # print(f"Event received: {event}")

    # Extract the token from headers. A missing or null 'headers' entry, or a
    # missing header, is treated as an empty token instead of raising.
    headers = event.get('headers') or {}
    token = headers.get('authorization') or ''

    # Check token validity with a constant-time comparison so the response
    # time does not reveal how much of the secret matched. Encoding to bytes
    # lets compare_digest accept non-ASCII input.
    is_authorized = hmac.compare_digest(
        token.encode('utf-8'), pw_hash.encode('utf-8')
    )

    # Log for debugging; the token itself is masked.
    print(f"Authorization status: {is_authorized}. Authorization token: {'*' * len(token)}")

    # Simple response
    return {
        "isAuthorized" : is_authorized
    }
|
||||
|
||||
# IAM policy response, which is overkill with no added benefit
|
||||
# to use IAM policy response, your api needs to have "enableSimpleResponses" : false
|
||||
# if is_authorized:
|
||||
# return {
|
||||
# "principalId" : "demo",
|
||||
# "policyDocument": {
|
||||
# "Version": "2012-10-17",
|
||||
# "Statement": [{
|
||||
# "Action": "execute-api:Invoke",
|
||||
# "Effect": "Allow",
|
||||
# "Resource": event["routeArn"]
|
||||
# }]
|
||||
# }
|
||||
# }
|
||||
# else:
|
||||
# return {
|
||||
# "principalId" : "demo",
|
||||
# "policyDocument": {
|
||||
# "Version": "2012-10-17",
|
||||
# "Statement": [{
|
||||
# "Action": "*",
|
||||
# "Effect": "Deny",
|
||||
# "Resource": "*"
|
||||
# }]
|
||||
# }
|
||||
# }
|
||||
@@ -0,0 +1,43 @@
|
||||
{
|
||||
"openapi" : "3.0.1",
|
||||
|
||||
"paths" : {
|
||||
"/" : {
|
||||
"get" : {
|
||||
"responses" : {
|
||||
"default" : {
|
||||
"description" : "Default response for GET /"
|
||||
}
|
||||
},
|
||||
"security" : [ {
|
||||
"SampleAuthorizer" : [ ]
|
||||
} ],
|
||||
"x-amazon-apigateway-integration" : {
|
||||
"payloadFormatVersion" : "2.0",
|
||||
"type" : "aws_proxy",
|
||||
"httpMethod" : "POST",
|
||||
"uri" : "arn:aws:apigateway:ap-east-1:lambda:path/2015-03-31/functions/arn:aws:lambda:ap-east-1:040216112220:function:EchoFunction/invocations",
|
||||
"connectionType" : "INTERNET"
|
||||
}
|
||||
}
|
||||
}
|
||||
},
|
||||
"components" : {
|
||||
"securitySchemes" : {
|
||||
"SampleAuthorizer" : {
|
||||
"type" : "apiKey",
|
||||
"name" : "Authorization",
|
||||
"in" : "header",
|
||||
"x-amazon-apigateway-authorizer" : {
|
||||
"identitySource" : "$request.header.Authorization",
|
||||
"authorizerUri" : "arn:aws:apigateway:ap-east-1:lambda:path/2015-03-31/functions/arn:aws:lambda:ap-east-1:040216112220:function:SampleAuthorizer/invocations",
|
||||
"authorizerPayloadFormatVersion" : "2.0",
|
||||
"authorizerResultTtlInSeconds" : 0,
|
||||
"type" : "request",
|
||||
"enableSimpleResponses" : true
|
||||
}
|
||||
}
|
||||
}
|
||||
},
|
||||
"x-amazon-apigateway-importexport-version" : "1.0"
|
||||
}
|
||||
@@ -0,0 +1,170 @@
|
||||
/**
|
||||
* # ApigwAuthSample
|
||||
* A working example which deploys HTTP api, Lambda functions, and necessary permissions.
|
||||
*
|
||||
*
|
||||
* ## Testing the API
|
||||
* To test this in postman, put in the following settings:
|
||||
*
|
||||
* URL: https://<api-id>.execute-api.ap-east-1.amazonaws.com/?inputValue=TestMessage123
|
||||
* Authorization: api key, key = Authorization, value = sha256 hash, add to = Header
|
||||
*
|
||||
*/
|
||||
|
||||
# IAM role for Lambda execution
|
||||
data "aws_iam_policy_document" "lambda_role" {
|
||||
statement {
|
||||
effect = "Allow"
|
||||
|
||||
principals {
|
||||
type = "Service"
|
||||
identifiers = ["lambda.amazonaws.com"]
|
||||
}
|
||||
|
||||
actions = ["sts:AssumeRole"]
|
||||
}
|
||||
}
|
||||
|
||||
resource "aws_iam_role" "role" {
|
||||
name = "ApiFunctionRole"
|
||||
assume_role_policy = data.aws_iam_policy_document.lambda_role.json
|
||||
}
|
||||
|
||||
resource "aws_iam_role_policy_attachment" "role" {
|
||||
role = aws_iam_role.role.name
|
||||
policy_arn = "arn:aws:iam::aws:policy/service-role/AWSLambdaBasicExecutionRole"
|
||||
}
|
||||
|
||||
data "archive_file" "EchoFunction" {
|
||||
type = "zip"
|
||||
source_file = "${path.module}/EchoFunction.py"
|
||||
output_path = "${path.module}/EchoFunction.zip"
|
||||
}
|
||||
|
||||
resource "aws_lambda_function" "EchoFunction" {
|
||||
filename = data.archive_file.EchoFunction.output_path
|
||||
function_name = "EchoFunction"
|
||||
description = "Function that echo query parameter inputValue"
|
||||
role = aws_iam_role.role.arn
|
||||
handler = "EchoFunction.lambda_handler"
|
||||
source_code_hash = data.archive_file.EchoFunction.output_base64sha256
|
||||
architectures = ["arm64"]
|
||||
|
||||
runtime = "python3.13"
|
||||
}
|
||||
|
||||
resource "aws_lambda_permission" "EchoFunction" {
|
||||
statement_id = "AllowExecutionFromApi"
|
||||
action = "lambda:InvokeFunction"
|
||||
function_name = aws_lambda_function.EchoFunction.function_name
|
||||
principal = "apigateway.amazonaws.com"
|
||||
source_arn = "arn:aws:execute-api:${var.aws-region}:${data.aws_caller_identity.this.account_id}:${aws_apigatewayv2_api.SampleHttpApi.id}/*/*"
|
||||
}
|
||||
|
||||
data "archive_file" "SampleAuthorizer" {
|
||||
type = "zip"
|
||||
source_file = "${path.module}/SampleAuthorizer.py"
|
||||
output_path = "${path.module}/SampleAuthorizer.zip"
|
||||
}
|
||||
|
||||
/* Test function with this input
|
||||
{
|
||||
"routeArn": "arn:aws:execute-api:ap-east-1:040216112220:wxzvfmiyd2/$default/GET/",
|
||||
"headers": {
|
||||
"authorization": "value of pw_hash"
|
||||
}
|
||||
}
|
||||
*/
|
||||
|
||||
resource "random_password" "pw" {
|
||||
length = 20
|
||||
min_upper = 2
|
||||
min_lower = 2
|
||||
min_numeric = 2
|
||||
min_special = 2
|
||||
}
|
||||
|
||||
resource "aws_lambda_function" "SampleAuthorizer" {
|
||||
filename = data.archive_file.SampleAuthorizer.output_path
|
||||
function_name = "SampleAuthorizer"
|
||||
description = "API authorizer"
|
||||
role = aws_iam_role.role.arn
|
||||
handler = "SampleAuthorizer.lambda_handler"
|
||||
source_code_hash = data.archive_file.SampleAuthorizer.output_base64sha256
|
||||
architectures = ["arm64"]
|
||||
runtime = "python3.13"
|
||||
|
||||
environment {
|
||||
variables = {
|
||||
region = var.aws-region
|
||||
account_id = data.aws_caller_identity.this.account_id
|
||||
api_id = aws_apigatewayv2_api.SampleHttpApi.id
|
||||
pw_hash = sha256(random_password.pw.result)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
resource "aws_lambda_permission" "SampleAuthorizer" {
|
||||
statement_id = "AllowExecutionFromApi"
|
||||
action = "lambda:InvokeFunction"
|
||||
function_name = aws_lambda_function.SampleAuthorizer.function_name
|
||||
principal = "apigateway.amazonaws.com"
|
||||
source_arn = "arn:aws:execute-api:${var.aws-region}:${data.aws_caller_identity.this.account_id}:${aws_apigatewayv2_api.SampleHttpApi.id}/*/*"
|
||||
}
|
||||
|
||||
resource "aws_cloudwatch_log_group" "loggroups" {
|
||||
for_each = toset(["SampleAuthorizer", "EchoFunction"])
|
||||
name = "/aws/lambda/${each.value}"
|
||||
retention_in_days = 1
|
||||
}
|
||||
|
||||
# api
|
||||
# HTTP API whose definition is imported from the OpenAPI document
# api_body.json.
resource "aws_apigatewayv2_api" "SampleHttpApi" {
  name            = "SampleHttpApi"
  protocol_type   = "HTTP"
  description     = "Sample http api which uses Lambda integration"
  ip_address_type = "ipv4"

  # Anchor the path to this module (as the archive_file sources do) so the
  # file is found regardless of the directory terraform is invoked from.
  body = file("${path.module}/api_body.json")
}
|
||||
|
||||
resource "aws_cloudwatch_log_group" "api_logging" {
|
||||
name = "/aws/api/SampleHttpApi"
|
||||
retention_in_days = 1
|
||||
}
|
||||
|
||||
resource "aws_apigatewayv2_stage" "stage1" {
|
||||
api_id = aws_apigatewayv2_api.SampleHttpApi.id
|
||||
name = "$default"
|
||||
description = "Default environment"
|
||||
deployment_id = aws_apigatewayv2_deployment.deployment.id
|
||||
|
||||
access_log_settings {
|
||||
destination_arn = aws_cloudwatch_log_group.api_logging.arn
|
||||
format = jsonencode(
|
||||
{
|
||||
"requestId" : "$context.requestId",
|
||||
"ip" : "$context.identity.sourceIp",
|
||||
"requestTime" : "$context.requestTime",
|
||||
"httpMethod" : "$context.httpMethod",
|
||||
"routeKey" : "$context.routeKey",
|
||||
"status" : "$context.status",
|
||||
"protocol" : "$context.protocol",
|
||||
"responseLength" : "$context.responseLength",
|
||||
"AuthorizerError" : "$context.authorizer.error"
|
||||
}
|
||||
)
|
||||
}
|
||||
}
|
||||
|
||||
resource "aws_apigatewayv2_deployment" "deployment" {
|
||||
api_id = aws_apigatewayv2_api.SampleHttpApi.id
|
||||
description = "Triggered by terraform"
|
||||
|
||||
triggers = {
|
||||
redeployment = timestamp()
|
||||
}
|
||||
|
||||
lifecycle {
|
||||
create_before_destroy = true
|
||||
}
|
||||
}
|
||||
@@ -0,0 +1,7 @@
|
||||
output "api_endpoint" {
|
||||
value = aws_apigatewayv2_api.SampleHttpApi.api_endpoint
|
||||
}
|
||||
|
||||
output "api_deployment_id" {
|
||||
value = aws_apigatewayv2_deployment.deployment.id
|
||||
}
|
||||
@@ -0,0 +1,31 @@
|
||||
provider "aws" {
|
||||
region = var.aws-region
|
||||
|
||||
default_tags {
|
||||
tags = {
|
||||
ServiceProvider = "RackspaceTechnology"
|
||||
Environment = var.environment
|
||||
Project = var.project
|
||||
Application = var.application
|
||||
TerraformMode = "managed"
|
||||
Owner = var.owner
|
||||
CostCenter = var.costcenter
|
||||
DynamicAddressGroup = var.DynamicAddressGroup
|
||||
TerraformDir = "${reverse(split("/", path.cwd))[1]}/${reverse(split("/", path.cwd))[0]}"
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
output "last-updated" {
|
||||
value = timestamp()
|
||||
}
|
||||
|
||||
terraform {
|
||||
required_version = "~> 1.13.0"
|
||||
required_providers {
|
||||
aws = {
|
||||
source = "hashicorp/aws"
|
||||
version = "~> 5.0"
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -0,0 +1,8 @@
|
||||
aws-region = "ap-east-1"
|
||||
customer-name = "ken2026"
|
||||
environment = "lab"
|
||||
project = "iac"
|
||||
application = "api"
|
||||
costcenter = "undefined"
|
||||
DynamicAddressGroup = "undefined"
|
||||
owner = "ken2026"
|
||||
@@ -0,0 +1,10 @@
|
||||
variable "aws-region" {}
|
||||
variable "customer-name" {}
|
||||
variable "environment" {}
|
||||
variable "project" {}
|
||||
variable "application" {}
|
||||
variable "owner" {}
|
||||
variable "costcenter" {}
|
||||
variable "DynamicAddressGroup" {}
|
||||
|
||||
data "aws_caller_identity" "this" {}
|
||||
@@ -0,0 +1,8 @@
|
||||
# LambdaPyZip
|
||||
|
||||
This layer uses the ```python_aws_lambda``` data source, which creates zip archives with the following inputs
|
||||
- source/function.py
|
||||
- source/requirements.txt
|
||||
|
||||
`function.py` contains the lambda handler, while `requirements.txt` states the dependencies. This datasource will run
|
||||
pip install and generate zip archives in the output directory.
|
||||
@@ -0,0 +1,27 @@
|
||||
terraform {
|
||||
required_providers {
|
||||
python = {
|
||||
source = "ATenderholt/python"
|
||||
version = "0.9.2"
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
provider "python" {
|
||||
pip_command = "pip3"
|
||||
}
|
||||
|
||||
data "python_aws_lambda" "example" {
|
||||
source_dir = "source"
|
||||
archive_path = "output/handler.zip"
|
||||
dependencies_path = "output/dependencies.zip"
|
||||
extra_args = "--only-binary=:all:"
|
||||
}
|
||||
|
||||
# Base64-encoded SHA-256 of the dependencies archive built by the
# python_aws_lambda data source.
output "lib_sum" {
  value = data.python_aws_lambda.example.dependencies_base64sha256
}
|
||||
|
||||
# Base64-encoded SHA-256 of the handler archive built by the
# python_aws_lambda data source.
output "function_sum" {
  value = data.python_aws_lambda.example.archive_base64sha256
}
|
||||
@@ -0,0 +1,8 @@
|
||||
# reference: https://aws.amazon.com/premiumsupport/knowledge-center/start-stop-lambda-eventbridge/
|
||||
import requests
|
||||
|
||||
def lambda_handler(event, context):
    """Fetch https://ipinfo.io/ and report the HTTP status code.

    Args:
        event: Lambda event payload; not used.
        context: Lambda context object; not used.

    Returns:
        A dict {"HttpResponseCode": <int status code of the response>}.

    Raises:
        requests.exceptions.RequestException: if the request fails or
            times out.
    """
    # requests applies no timeout by default, so a stalled connection would
    # block until the function itself is killed. The 5-second value is a
    # conservative choice; tune it against the configured function timeout.
    r = requests.get('https://ipinfo.io/', timeout=5)
    return {
        "HttpResponseCode": r.status_code
    }
|
||||
@@ -0,0 +1,2 @@
|
||||
dnspython==2.7.0
|
||||
requests
|
||||
@@ -0,0 +1,7 @@
|
||||
# bea-adc
|
||||
Module to deploy network resources and ad connector for use with AWS SSO
|
||||
|
||||
## Input variables
|
||||
The variable adc-service-account-password needs to be supplied via environment variable. This prevents terraform
|
||||
from saving the password in the source code. Note that the value is still recorded in tfstate once applied, so the state file must be protected.
|
||||
|
||||
@@ -0,0 +1,15 @@
|
||||
data "aws_caller_identity" "this" {}
|
||||
|
||||
locals {
|
||||
default-tags = merge({
|
||||
ServiceProvider = "None"
|
||||
Environment = var.environment
|
||||
Project = var.project
|
||||
Application = var.application
|
||||
TerraformMode = "managed"
|
||||
TerraformDir = trimprefix(path.cwd, "/my/work/xpk-git/")
|
||||
CreatedBy = data.aws_caller_identity.this.arn
|
||||
BuildDate = formatdate("YYYYMMDD", timestamp())
|
||||
})
|
||||
resource-prefix = "${var.environment}-${var.aws-region-short}-${var.customer-name}-${var.project}"
|
||||
}
|
||||
@@ -0,0 +1,48 @@
|
||||
module "vpc-subnets" {
|
||||
source = "../../modules/networking/vpc_subnets"
|
||||
|
||||
application = var.application
|
||||
aws-region = var.aws-region
|
||||
customer-name = var.customer-name
|
||||
default-tags = local.default-tags
|
||||
environment = var.environment
|
||||
project = var.project
|
||||
vpc-cidr = var.vpc-cidr
|
||||
number-of-private-subnets-per-az = var.number-of-private-subnets-per-az
|
||||
number-of-public-subnets-per-az = var.number-of-public-subnets-per-az
|
||||
create-nat-gateway = false
|
||||
enable-flow-log = true
|
||||
vpcflowlog-retain-days = 90
|
||||
vpcflowlog-cwl-loggroup-key-arn = ""
|
||||
create-free-vpc-endpoints = false
|
||||
}
|
||||
|
||||
# S3 flow log needs to be created separately. it's not supported by vpc_subnets module
|
||||
resource "aws_flow_log" "vpc-log-s3" {
|
||||
log_destination = var.vpc-flowlog-bucket-arn
|
||||
log_destination_type = "s3"
|
||||
traffic_type = "ALL"
|
||||
vpc_id = module.vpc-subnets.vpc_id
|
||||
}
|
||||
|
||||
/*
|
||||
After adc is deployed by terraform, the following tasks need to be performed manually.
|
||||
They cannot be managed by terraform
|
||||
1. Edit security group created for adconnector. SG name is d-???_controllers
|
||||
2. Enable client LDAPS communication
|
||||
3. Setup maintenance notification through SNS
|
||||
4. Enable SSO application. Setting enable_sso in member account results in error. alias is deliberately not set
|
||||
*/
|
||||
|
||||
module "adconnector" {
|
||||
source = "../../modules/security_identity_compliance/ds-adconnector"
|
||||
|
||||
adc-dns-ips = var.adc-dns-ips
|
||||
adc-domainname = var.adc-domainname
|
||||
adc-service-account-password = var.adc-service-account-password
|
||||
adc-service-account-username = var.adc-service-account-username
|
||||
adc-size = var.adc-size
|
||||
adc-subnet-ids = module.vpc-subnets.private-subnet-ids
|
||||
adc-vpc-id = module.vpc-subnets.vpc_id
|
||||
default-tags = local.default-tags
|
||||
}
|
||||
@@ -0,0 +1,11 @@
|
||||
output "directory-id" {
|
||||
value = module.adconnector.directory-id
|
||||
}
|
||||
|
||||
output "security-group-id" {
|
||||
value = module.adconnector.security-group-id
|
||||
}
|
||||
|
||||
output "customer-dns-ip" {
|
||||
value = module.adconnector.customer-dns-ip
|
||||
}
|
||||
@@ -0,0 +1,13 @@
|
||||
provider "aws" {
|
||||
region = var.aws-region
|
||||
}
|
||||
|
||||
terraform {
|
||||
required_version = ">= 1.0"
|
||||
required_providers {
|
||||
aws = {
|
||||
source = "hashicorp/aws"
|
||||
version = ">= 3.25"
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -0,0 +1,15 @@
|
||||
aws-region = "ap-east-1"
|
||||
aws-region-short = "ape1"
|
||||
customer-name = "acme"
|
||||
environment = "preview"
|
||||
project = "sso"
|
||||
application = "sso"
|
||||
vpc-cidr = "10.37.54.0/24"
|
||||
number-of-public-subnets-per-az = 0
|
||||
number-of-private-subnets-per-az = 1
|
||||
vpc-flowlog-bucket-arn = "arn:aws:s3:::prd-vpc-flow-logs-894849410890"
|
||||
adc-domainname = "acme.com"
|
||||
adc-size = "Large"
|
||||
adc-dns-ips = ["10.135.72.66", "10.135.72.67"]
|
||||
adc-service-account-username = "AWSSSOPRD"
|
||||
adc-enable-sso = true
|
||||
@@ -0,0 +1,22 @@
|
||||
variable "aws-region" {}
|
||||
variable "aws-region-short" {}
|
||||
variable "customer-name" {}
|
||||
variable "environment" {}
|
||||
variable "project" {}
|
||||
variable "application" {}
|
||||
variable "vpc-cidr" {}
|
||||
variable "number-of-private-subnets-per-az" {}
|
||||
variable "number-of-public-subnets-per-az" {}
|
||||
variable "vpc-flowlog-bucket-arn" {}
|
||||
variable "adc-domainname" {}
|
||||
variable "adc-size" {}
|
||||
variable "adc-dns-ips" {}
|
||||
variable "adc-service-account-username" {}
|
||||
# Password of the AD Connector service account. Marked sensitive so it is
# redacted from plan/apply output; defaults to an empty string when no value
# is supplied.
variable "adc-service-account-password" {
  type      = string
  sensitive = true
  # Terraform reads variable values from environment variables prefixed with
  # TF_VAR_.
  description = "Please supply ad svc account with environment variable (i.e. export TF_VAR_adc-service-account-password=xxx)"
  default     = ""
}
|
||||
variable "adc-enable-sso" {}
|
||||
|
||||
@@ -0,0 +1,15 @@
|
||||
data "aws_caller_identity" "this" {}
|
||||
|
||||
locals {
|
||||
default-tags = merge({
|
||||
ServiceProvider = "None"
|
||||
Environment = var.environment
|
||||
Project = var.project
|
||||
Application = var.application
|
||||
TerraformMode = "managed"
|
||||
TerraformDir = trimprefix(path.cwd, "/my/work/xpk-git/")
|
||||
CreatedBy = data.aws_caller_identity.this.arn
|
||||
BuildDate = formatdate("YYYYMMDD", timestamp())
|
||||
})
|
||||
resource-prefix = "${var.environment}-${var.aws-region-short}-${var.customer-name}-${var.project}"
|
||||
}
|
||||
@@ -0,0 +1,28 @@
|
||||
module sso {
|
||||
source = "../../modules/security_identity_compliance/sso-permissionsets"
|
||||
|
||||
for_each = { for item in local.items : item.name => item }
|
||||
|
||||
default-tags = local.default-tags
|
||||
pset-name = each.value.name
|
||||
pset-desc = each.value.desc
|
||||
pset-managed-policy-arn = each.value.mpolicy
|
||||
pset-session-duration = each.value.session
|
||||
|
||||
}
|
||||
|
||||
locals {
|
||||
csv_data = <<-CSV
|
||||
name,desc,mpolicy,session
|
||||
ViewOnly,View only access,arn:aws:iam::aws:policy/job-function/ViewOnlyAccess,PT4H
|
||||
ReadOnly,Read only access,arn:aws:iam::aws:policy/ReadOnlyAccess,PT4H
|
||||
FullAccess,Full admin access,arn:aws:iam::aws:policy/AdministratorAccess,PT4H
|
||||
NetworkAdmin,Network admin access,arn:aws:iam::aws:policy/job-function/NetworkAdministrator,PT4H
|
||||
DatabaseAdmin,Database admin access,arn:aws:iam::aws:policy/job-function/DatabaseAdministrator,PT4H
|
||||
BillingAdmin,Billing admin access,arn:aws:iam::aws:policy/job-function/Billing,PT4H
|
||||
SecurityAudit,Security admin access,arn:aws:iam::aws:policy/SecurityAudit,PT4H
|
||||
PowerUser,Full access excluding IAM,arn:aws:iam::aws:policy/PowerUserAccess,PT4H
|
||||
CSV
|
||||
|
||||
items = csvdecode(local.csv_data)
|
||||
}
|
||||
@@ -0,0 +1,13 @@
|
||||
provider "aws" {
|
||||
region = var.aws-region
|
||||
}
|
||||
|
||||
terraform {
|
||||
required_version = ">= 1.0"
|
||||
required_providers {
|
||||
aws = {
|
||||
source = "hashicorp/aws"
|
||||
version = ">= 3.25"
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -0,0 +1,64 @@
|
||||
data "aws_ssoadmin_instances" "sso1" {}
|
||||
|
||||
locals {
|
||||
csv_data2 = <<-CSV
|
||||
username,email,lastName,firstName
|
||||
user1,user1@acme.local,Doe,John
|
||||
user2,user2@acme.local,Smith,Jane
|
||||
CSV
|
||||
|
||||
users = csvdecode(local.csv_data2)
|
||||
}
|
||||
|
||||
resource "aws_identitystore_user" "sso-user" {
|
||||
for_each = { for item in local.users : item.username => item }
|
||||
identity_store_id = tolist(data.aws_ssoadmin_instances.sso1.identity_store_ids)[0]
|
||||
display_name = "${each.value.firstName} ${each.value.lastName}"
|
||||
user_name = each.value.username
|
||||
nickname = each.value.username
|
||||
emails {
|
||||
primary = true
|
||||
value = each.value.email
|
||||
}
|
||||
|
||||
name {
|
||||
family_name = each.value.lastName
|
||||
given_name = each.value.firstName
|
||||
}
|
||||
}
|
||||
|
||||
resource "aws_identitystore_group" "sso-group" {
|
||||
identity_store_id = tolist(data.aws_ssoadmin_instances.sso1.identity_store_ids)[0]
|
||||
display_name = "Viewers"
|
||||
description = "Users with view permission"
|
||||
}
|
||||
|
||||
resource "aws_identitystore_group_membership" "sso-group-membership" {
|
||||
for_each = aws_identitystore_user.sso-user
|
||||
identity_store_id = tolist(data.aws_ssoadmin_instances.sso1.identity_store_ids)[0]
|
||||
group_id = aws_identitystore_group.sso-group.group_id
|
||||
member_id = each.value.user_id
|
||||
}
|
||||
|
||||
locals {
|
||||
csv_data3 = <<-CSV
|
||||
seq,groupName,permission,accountId
|
||||
1,Viewers,ViewOnly,865184416664
|
||||
2,Viewers,ViewOnly,572802010687
|
||||
CSV
|
||||
|
||||
accounts = csvdecode(local.csv_data3)
|
||||
}
|
||||
|
||||
resource "aws_ssoadmin_account_assignment" "pset-assignment" {
|
||||
for_each = { for item in local.accounts : item.seq => item }
|
||||
|
||||
instance_arn = tolist(data.aws_ssoadmin_instances.sso1.arns)[0]
|
||||
permission_set_arn = module.sso[each.value.permission].pset-arn
|
||||
|
||||
principal_id = aws_identitystore_group.sso-group.group_id
|
||||
principal_type = "GROUP"
|
||||
|
||||
target_id = each.value.accountId
|
||||
target_type = "AWS_ACCOUNT"
|
||||
}
|
||||
@@ -0,0 +1,7 @@
|
||||
aws-region = "ap-east-1"
|
||||
aws-region-short = "ape1"
|
||||
customer-name = "acme"
|
||||
environment = "preview"
|
||||
project = "security"
|
||||
application = "sso"
|
||||
|
||||
@@ -0,0 +1,6 @@
|
||||
variable "aws-region" {}
|
||||
variable "aws-region-short" {}
|
||||
variable "customer-name" {}
|
||||
variable "environment" {}
|
||||
variable "project" {}
|
||||
variable "application" {}
|
||||
@@ -0,0 +1,40 @@
|
||||
module "aws-backup" {
|
||||
source = "../../modules/storage/aws-backup"
|
||||
|
||||
daily-backup-cron = var.daily-backup-cron
|
||||
monthly-backup-cron = var.monthly-backup-cron
|
||||
daily-backup-retention = var.daily-backup-retention
|
||||
monthly-backup-retention = var.monthly-backup-retention
|
||||
service-opt-in = {
|
||||
"Aurora" : {
|
||||
enabled = false
|
||||
}
|
||||
"DynamoDB" : {
|
||||
enabled = true
|
||||
}
|
||||
"EBS" : {
|
||||
enabled = false
|
||||
}
|
||||
"EC2" : {
|
||||
enabled = true
|
||||
}
|
||||
"EFS" : {
|
||||
enabled = true
|
||||
}
|
||||
"FSx" : {
|
||||
enabled = false
|
||||
}
|
||||
"Redshift" : {
|
||||
enabled = true
|
||||
}
|
||||
"RDS" : {
|
||||
enabled = true
|
||||
}
|
||||
"VirtualMachine" : {
|
||||
enabled = false
|
||||
}
|
||||
"S3" : {
|
||||
enabled = false
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -0,0 +1,23 @@
|
||||
provider "aws" {
|
||||
region = var.aws-region
|
||||
default_tags {
|
||||
tags = {
|
||||
ServiceProvider = "RackspaceTechnology"
|
||||
Environment = var.environment
|
||||
Project = var.project
|
||||
Application = var.application
|
||||
Owner = var.owner
|
||||
TerraformDir = "${reverse(split("/", path.cwd))[1]}/${reverse(split("/", path.cwd))[0]}"
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
terraform {
|
||||
required_version = ">= 1.3.9"
|
||||
required_providers {
|
||||
aws = {
|
||||
source = "hashicorp/aws"
|
||||
version = "~> 5.0"
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -0,0 +1,11 @@
|
||||
aws-region = "ap-east-1"
|
||||
customer-name = "ken2026"
|
||||
environment = "dev"
|
||||
project = "iac"
|
||||
application = "backup"
|
||||
owner = "ken2026"
|
||||
daily-backup-retention = 31
|
||||
daily-backup-cron = "cron(0 20 * * ? *)"
|
||||
monthly-backup-retention = 365
|
||||
monthly-backup-cron = "cron(0 20 1 * ? *)"
|
||||
# cron(Minutes Hours Day-of-month Month Day-of-week Year)
|
||||
@@ -0,0 +1,11 @@
|
||||
variable "aws-region" {}
|
||||
variable "customer-name" {}
|
||||
variable "environment" {}
|
||||
variable "project" {}
|
||||
variable "application" {}
|
||||
variable "owner" {}
|
||||
|
||||
variable "daily-backup-retention" {}
|
||||
variable "daily-backup-cron" {}
|
||||
variable "monthly-backup-retention" {}
|
||||
variable "monthly-backup-cron" {}
|
||||
@@ -0,0 +1,12 @@
|
||||
# Root module for creating baseline resources including:
|
||||
- iam password policy
|
||||
- delete default VPCs in all regions
|
||||
- create cloudtrail
|
||||
- enable aws config in all regions
|
||||
- enable guardduty
|
||||
- enable securityhub
|
||||
- disable s3 public access
|
||||
- require EBS encryption
|
||||
|
||||
## If AWS organisation is in use
|
||||
If you are using AWS organisation, setup delegated admin for guardduty and securityhub. This allows centralised management.
|
||||
@@ -0,0 +1,51 @@
|
||||
module "iam-baseline" {
|
||||
# iam password policy, baseline roles, access analyzer, cloudhealth role
|
||||
source = "../../modules/security_identity_compliance/roles_iam_resources"
|
||||
|
||||
customer-name = var.customer-name
|
||||
default-tags = local.default-tags
|
||||
create-cloudhealth-resources = false
|
||||
}
|
||||
|
||||
module "cloudtrail" {
|
||||
# Create cloudtrail
|
||||
source = "../../modules/security_identity_compliance/cloudtrail_cwlogs"
|
||||
resource-prefix = local.resource-prefix
|
||||
default-tags = local.default-tags
|
||||
}
|
||||
|
||||
module "delete-default-vpcs" {
|
||||
# delete default VPCs in all regions
|
||||
source = "../../modules/networking/delete-default-vpcs"
|
||||
}
|
||||
|
||||
module "enable-aws-config" {
|
||||
# enable aws config in all regions and setup aggregation
|
||||
source = "../../modules/security_identity_compliance/aws_config"
|
||||
resource-prefix = local.resource-prefix
|
||||
default-tags = local.default-tags
|
||||
}
|
||||
|
||||
module "enable-guardduty" {
|
||||
/* enable guardduty
|
||||
If you are using AWS organisation, GD delegated admin should be configured
|
||||
on the landing zone security account. This allows centralised management.
|
||||
See https://docs.aws.amazon.com/guardduty/latest/ug/guardduty_settingup.html
|
||||
*/
|
||||
source = "../../modules/security_identity_compliance/guardduty"
|
||||
default-tags = local.default-tags
|
||||
}
|
||||
|
||||
module "enable-securityhub" {
|
||||
/* enable security hub
|
||||
If you are using AWS organisation, SH delegated admin should be configured
|
||||
on the landing zone security account. This allows centralised management.
|
||||
https://docs.aws.amazon.com/securityhub/latest/userguide/designate-orgs-admin-account.html
|
||||
*/
|
||||
source = "../../modules/security_identity_compliance/security_hub"
|
||||
}
|
||||
|
||||
module "default-account-settings" {
|
||||
# other default account settings
|
||||
source = "../../modules/security_identity_compliance/other-default-settings"
|
||||
}
|
||||
@@ -0,0 +1,13 @@
|
||||
provider "aws" {
|
||||
region = var.aws-region
|
||||
}
|
||||
|
||||
terraform {
|
||||
required_version = "~> 1.2.5"
|
||||
required_providers {
|
||||
aws = {
|
||||
source = "hashicorp/aws"
|
||||
version = "~> 3.75.2"
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -0,0 +1,5 @@
|
||||
# Input values for this configuration (one entry per declared variable).
application   = "infra"
aws-region    = "ap-southeast-1"
customer-name = "ken2026"
environment   = "lab"
project       = "terraform-dev"
|
||||
@@ -0,0 +1,19 @@
|
||||
# Input variables. All of them are required and are supplied as strings.

variable "aws-region" {
  description = "AWS region; also contributes its first characters to local.resource-prefix."
  type        = string
}

variable "customer-name" {
  description = "Customer name, used as a component of local.resource-prefix."
  type        = string
}

variable "environment" {
  description = "Environment name, used for the Environment tag and in local.resource-prefix."
  type        = string
}

variable "project" {
  description = "Project name, used for the Project tag and in local.resource-prefix."
  type        = string
}

variable "application" {
  description = "Application name, used for the Application tag."
  type        = string
}
|
||||
|
||||
locals {
  # Tags shared by the resources of this configuration.
  default-tags = {
    ServiceProvider = "RackspaceTechnology"
    Environment     = var.environment
    Project         = var.project
    Application     = var.application
    TerraformMode   = "managed"
    # Working directory with the local checkout prefix stripped.
    TerraformDir = trimprefix(path.cwd, "/my/work/xpk-git/")
    # NOTE(review): timestamp() is re-evaluated on every run, so this tag
    # produces a diff on every plan.
    BuildDate = formatdate("YYYYMMDD", timestamp())
  }

  # <environment>-<first two characters of the region>-<customer>-<project>.
  # substr() has to be called inside the ${ ... } sequence; written as plain
  # text in the template it ends up verbatim in the prefix
  # (e.g. "lab-substr(ap-southeast-1,0,2)-...").
  resource-prefix = "${var.environment}-${substr(var.aws-region, 0, 2)}-${var.customer-name}-${var.project}"
}
|
||||
|
||||
@@ -0,0 +1,39 @@
|
||||
# Deployer EC2 instance built from the local ec2 module, using the AMI
# resolved by data.aws_ami.ami-lookup.
module "deployer-ec2" {
  source = "../../modules/compute/ec2"

  # Identity
  instance-name    = "rackspace-deployer-ec2-test"
  instance-type    = "t3.micro"
  instance-profile = "TerraformRole"
  key-name         = "whk1-ec2-key-555344966285"

  # Image and storage
  # ami-id = "ami-072e4595d41025d94"
  ami-id           = data.aws_ami.ami-lookup.id
  ebs-encrypted    = true
  root-volume-size = 15

  # Networking
  subnet-id       = "subnet-07e4392828a70b1f9"
  security-groups = ["sg-03282995027b7a9fc"]
  asso-eip        = false
  asso-public-ip  = false

  # Tags
  default-tags = local.default-tags
  # NOTE(review): every other argument of this module is hyphenated; confirm
  # the module really declares this input with an underscore.
  additional_tags = { Backup = "None" }
}
|
||||
|
||||
# Most recent x86_64 HVM image whose name starts with
# "CIS Amazon Linux 2 Kernel 5.10", owned by the CIS account.
data "aws_ami" "ami-lookup" {
  owners      = ["211372476111"] # CIS
  most_recent = true

  filter {
    name   = "architecture"
    values = ["x86_64"]
  }

  filter {
    name   = "virtualization-type"
    values = ["hvm"]
  }

  filter {
    name   = "name"
    values = ["CIS Amazon Linux 2 Kernel 5.10*"]
  }
}
|
||||
@@ -0,0 +1,8 @@
|
||||
# Input values. Keys must match the declared variable names exactly
# (Terraform identifiers are case-sensitive): the variables are declared as
# "owner" and "costcenter", so "Owner"/"CostCenter" would leave them unset.
aws-region          = "ap-southeast-1"
customer-name       = "bea"
environment         = "dev"
project             = "iac"
application         = "terraform"
costcenter          = "none"
DynamicAddressGroup = ""
owner               = "Rackspace"
|
||||
@@ -0,0 +1,25 @@
|
||||
# Input variables. All of them are required and are supplied as strings.

variable "aws-region" {
  description = "AWS region; also contributes its first characters to local.resource-prefix."
  type        = string
}

variable "customer-name" {
  description = "Customer name, used as a component of local.resource-prefix."
  type        = string
}

variable "environment" {
  description = "Environment name, used for the Environment tag and in local.resource-prefix."
  type        = string
}

variable "project" {
  description = "Project name, used for the Project tag and in local.resource-prefix."
  type        = string
}

variable "application" {
  description = "Application name, used for the Application tag."
  type        = string
}

variable "owner" {
  description = "Value of the Owner tag."
  type        = string
}

variable "costcenter" {
  description = "Value of the CostCenter tag."
  type        = string
}

variable "DynamicAddressGroup" {
  description = "Value of the DynamicAddressGroup tag."
  type        = string
}
|
||||
|
||||
locals {
  # Tags shared by the resources of this configuration.
  default-tags = {
    ServiceProvider = "RackspaceTechnology"
    Environment     = var.environment
    Project         = var.project
    Application     = var.application
    TerraformMode   = "managed"
    # NOTE(review): timestamp() is re-evaluated on every run, so this tag
    # produces a diff on every plan.
    BuildDate           = formatdate("YYYYMMDD", timestamp())
    Owner               = var.owner
    CostCenter          = var.costcenter
    DynamicAddressGroup = var.DynamicAddressGroup
  }

  # <environment>-<first two characters of the region>-<customer>-<project>.
  # substr() has to be called inside the ${ ... } sequence; written as plain
  # text in the template it ends up verbatim in the prefix
  # (e.g. "dev-substr(ap-southeast-1,0,2)-...").
  resource-prefix = "${var.environment}-${substr(var.aws-region, 0, 2)}-${var.customer-name}-${var.project}"
}
|
||||
|
||||
@@ -0,0 +1,64 @@
|
||||
# Post-install steps
|
||||
|
||||
## Create lbc service account
|
||||
kubectl apply -f 1-lbc.yaml
|
||||
|
||||
## Install AWS Load Balancer Controller in EKS
|
||||
helm repo add eks https://aws.github.io/eks-charts
|
||||
helm repo update
|
||||
|
||||
helm install aws-load-balancer-controller eks/aws-load-balancer-controller \
|
||||
-n kube-system \
|
||||
--set clusterName=xpk-eks01-sunbird \
|
||||
--set serviceAccount.create=false \
|
||||
--set serviceAccount.name=aws-load-balancer-controller-sa
|
||||
|
||||
kubectl -n kube-system get deployment aws-load-balancer-controller
|
||||
|
||||
kubectl logs -n kube-system deployment/aws-load-balancer-controller -f
|
||||
|
||||
## Allow web traffic to nodes
|
||||
Port 80 must be allowed on the EKS nodes' security group so that the ALB can successfully register targets. This is now done in main.tf.
|
||||
|
||||
## Testing
|
||||
The ALB correctly sends traffic to the nginx pods:
|
||||
|
||||
```bash
|
||||
curl k8s-default-nginxing-a42064aa7e-1786392641.ap-east-1.elb.amazonaws.com
|
||||
<h1>Web Server nginx-web-f5988bf66-9lghc - Unique ID: </h1><p>Deployed on EKS Wed Feb 11 09:46:41 UTC 2026</p>
|
||||
|
||||
curl k8s-default-nginxing-a42064aa7e-1786392641.ap-east-1.elb.amazonaws.com
|
||||
<h1>Web Server nginx-web-f5988bf66-6ptff - Unique ID: </h1><p>Deployed on EKS Wed Feb 11 09:46:41 UTC 2026</p>
|
||||
|
||||
curl k8s-default-nginxing-a42064aa7e-1786392641.ap-east-1.elb.amazonaws.com
|
||||
<h1>Web Server nginx-web-f5988bf66-tw6rr - Unique ID: </h1><p>Deployed on EKS Wed Feb 11 09:46:45 UTC 2026</p>
|
||||
|
||||
```
|
||||
|
||||
## Notes on IPv6
|
||||
EKS could not be deployed on IPv6-only private subnets. It appears that AWS requires at least 2 free IPv4 addresses in the subnet.
|
||||
I tried and the following error was returned.
|
||||
|
||||
```
|
||||
Error: creating EKS Cluster (xpk-eks01-akita): operation error EKS: CreateCluster, https response error StatusCode: 400,
|
||||
RequestID: b25794cc-3220-4393-a435-c92e2f8aafdd, InvalidParameterException: Atleast one subnet in each AZ should have 2 free IPs.
|
||||
Invalid AZs: { [ap-east-1c, ap-east-1b] }, provided subnets: { subnet-02aaf75a3e4700f74, subnet-02071b29e2883d5b1 }
|
||||
```
|
||||
|
||||
## Notes on KMS key
|
||||
I tried using an AWS-managed key for EKS, but the deployment failed with an error.
|
||||
|
||||
```hcl
|
||||
encryption_config = {
|
||||
provider_key_arn = "arn:aws:kms:${data.aws_region.this.id}:${data.aws_caller_identity.current.account_id}:alias/aws/secretsmanager"
|
||||
resources = ["secrets"]
|
||||
}
|
||||
```
|
||||
|
||||
```
|
||||
Error: creating EKS Cluster (xpk-eks01-vervet): operation error EKS: CreateCluster, https response error StatusCode: 400, RequestID:
|
||||
0b866e07-352a-439c-9196-f7a671bdd0ee, api error InvalidRequestException: User not authorized to perform kms:CreateGrant operation
|
||||
```
|
||||
|
||||
When I used `create_kms_key = true`, EKS was created successfully. I can see that the EKS cluster role is explicitly allowed
|
||||
in the key policy.
|
||||
@@ -0,0 +1,146 @@
|
||||
# IAM role plus instance profile for the EKS bastion. EC2 is the trusted
# service, and one inline policy grants eks:Describe* / eks:List* on all
# resources.
module "BastionRole" {
  source = "../../modules/security_identity_compliance/iam-role-v2"

  role-name               = "BastionInstanceProfile"
  description             = "EKS bastion instance profile"
  trusted-entity          = "ec2.amazonaws.com"
  create-instance-profile = true

  policies = {
    EksAdmin = {
      description = "Eks read permissions required for kubectl"
      policy = jsonencode({
        Version = "2012-10-17"
        Statement = [
          {
            Sid      = "EksRead"
            Effect   = "Allow"
            Action   = ["eks:Describe*", "eks:List*"]
            Resource = "*"
          }
        ]
      })
    }
  }
}
|
||||
|
||||
# Attach the AWS-managed AmazonSSMManagedInstanceCore policy to the bastion role.
resource "aws_iam_role_policy_attachment" "BastionProfilePermissions" {
  policy_arn = "arn:aws:iam::aws:policy/AmazonSSMManagedInstanceCore"
  role       = module.BastionRole.name
}
|
||||
|
||||
# Bastion instance used to manage the EKS cluster. Its user data installs git,
# kubectl, helm and eksctl (arm64 builds) and writes a kubeconfig for the
# cluster.
module "eks-bastion" {
  depends_on = [module.eks] # essential for initializing kubectl in userdata
  source     = "../../modules/compute/ec2"

  additional-tags = {}
  ami-id          = data.aws_ami.this.id
  asso-eip        = false
  asso-public-ip  = true
  use-ipv6        = true
  data-volumes    = {}
  ebs-encrypted   = true
  instance-name   = "${var.environment}-eks-bastion-${random_pet.pet.id}"
  instance-type   = "t4g.micro"
  key-name        = aws_key_pair.kp.key_name
  kms-key-id      = ""
  root-volume-size = "8"
  # security-groups = [module.bastion-sg.id, module.eks.cluster_primary_security_group_id]
  security-groups  = [module.bastion-sg.id]
  subnet-id        = module.vpc.public_subnets[0]
  instance-profile = module.BastionRole.profile-name[0]
  spot-max-price   = 0.0116 # t4g.micro

  # kubectl is downloaded from dl.k8s.io, the location documented by the
  # Kubernetes project, instead of the legacy kubernetes-release GCS bucket.
  # Terraform only interpolates ${...}; $(...) and $VAR reach the shell as-is.
  user-data = <<EOF
#!/bin/bash
# eks bastion setup
## Install git
dnf -y install git

## Install kubectl
curl -LO "https://dl.k8s.io/release/$(curl -L -s https://dl.k8s.io/release/stable.txt)/bin/linux/arm64/kubectl"
chmod +x kubectl
mv kubectl /usr/local/bin/

## Install helm
cd /tmp
wget -O/tmp/helm.tgz https://get.helm.sh/helm-v4.1.1-linux-arm64.tar.gz
tar zxf /tmp/helm.tgz
mv /tmp/linux-arm64/helm /usr/local/bin/helm
chmod +x /usr/local/bin/helm

## Install eksctl
cd /tmp
ARCH=arm64
PLATFORM=$(uname -s)_$ARCH
curl -sLO "https://github.com/eksctl-io/eksctl/releases/latest/download/eksctl_$PLATFORM.tar.gz"
tar zxf eksctl_Linux_arm64.tar.gz
mv eksctl /usr/local/bin
chmod +x /usr/local/bin/eksctl

## Create kube config
echo Create kube config...
/usr/bin/aws eks update-kubeconfig --name ${var.eks_cluster_name}-${random_pet.pet.id}
# echo Sleep for 5 minutes and wait for fargate profile to come up
# /usr/bin/sleep 300
#
# ## Grant EKS console access to IAM role: must be executed with cluster creator's identity. cluster role as instance profile won't do it
# echo Patching configmap/aws-auth...
# ROLE=" - rolearn: arn:aws:iam::${data.aws_caller_identity.current.account_id}:role/rackLE\n username: build\n groups:\n - system:masters"
# /usr/local/bin/kubectl --kubeconfig=/root/.kube/config get -n kube-system configmap/aws-auth -o yaml | awk "/mapRoles: \|/{print;print \"$ROLE\";next}1" > /tmp/aws-auth-patch.yml
# /usr/local/bin/kubectl --kubeconfig=/root/.kube/config patch configmap/aws-auth -n kube-system --patch "$(cat /tmp/aws-auth-patch.yml)"
# /usr/local/bin/kubectl --kubeconfig=/root/.kube/config get -n kube-system configmap/aws-auth -o yaml
EOF
}
|
||||
|
||||
# Most recent Amazon-owned arm64 HVM image whose name matches the
# Amazon Linux 2023 / kernel 6.1 pattern below.
data "aws_ami" "this" {
  owners      = ["amazon"]
  name_regex  = "^al2023-ami-2023.*-kernel-6.1-arm64"
  most_recent = true

  filter {
    name   = "architecture"
    values = ["arm64"]
  }

  filter {
    name   = "virtualization-type"
    values = ["hvm"]
  }
}
|
||||
|
||||
# ED25519 key pair generated by Terraform; its public half is registered as
# aws_key_pair.kp.
resource "tls_private_key" "sshkey" {
  algorithm = "ED25519"
}
|
||||
|
||||
# EC2 key pair for the bastion, backed by the generated ED25519 public key.
resource "aws_key_pair" "kp" {
  public_key = tls_private_key.sshkey.public_key_openssh
  key_name   = "${var.environment}-eks-bastion-${random_pet.pet.id}-key"
}
|
||||
|
||||
# Security group for the EKS bastion.
# Rule strings are presumably "protocol,from_port,to_port,cidr,description";
# verify against the security_group module.
module "bastion-sg" {
  source = "../../modules/compute/security_group"

  name        = "eks-bastion-${random_pet.pet.id}-sg"
  description = "${var.environment}-eks-bastion-${random_pet.pet.id}-sg"
  vpc-id      = module.vpc.vpc_id

  # NOTE(review): r1 appears to open SSH to every IPv4 address.
  ingress = {
    r1 = "tcp,22,22,0.0.0.0/0,ssh"
  }

  egress = {
    r1 = "-1,-1,-1,0.0.0.0/0,Allow egress"
  }
}
|
||||
|
||||
# my security_group module does not support ipv6_cidr_blocks
|
||||
# Egress rule on the bastion security group: all protocols to any IPv6 address.
resource "aws_security_group_rule" "ipv6_egress" {
  description       = "Allow ipv6 egress"
  security_group_id = module.bastion-sg.id

  type             = "egress"
  protocol         = "all"
  from_port        = -1
  to_port          = -1
  ipv6_cidr_blocks = ["::/0"]
}
|
||||
@@ -0,0 +1,5 @@
|
||||
# ServiceAccount "aws-load-balancer-controller-sa" in kube-system.
apiVersion: v1
kind: ServiceAccount
metadata:
  namespace: kube-system
  name: aws-load-balancer-controller-sa
|
||||
@@ -0,0 +1,58 @@
|
||||
# nginx Deployment (10 replicas). An init container writes a per-pod
# index.html into an emptyDir volume that the nginx container mounts read-only.
apiVersion: apps/v1
kind: Deployment
metadata:
  name: nginx-web
spec:
  replicas: 10
  selector:
    matchLabels:
      app: nginx-web
  template:
    metadata:
      annotations:
        # Require dedicated ENI per pod
        # NOTE(review): confirm these annotation keys are recognised by the VPC CNI.
        vpc.cni.amazonaws.com/network-mode: "IPV4"
        vpc.cni.amazonaws.com/eniMode: "per-pod" # One ENI per pod
        vpc.cni.amazonaws.com/eniPrefixMode: "GLOBAL" # Prefix mode for efficiency
      labels:
        app: nginx-web
    spec:
      volumes:
        - name: nginx-html
          emptyDir: {}
      initContainers:
        - name: unique-index
          image: busybox:1.35
          env:
            - name: POD_NAME
              valueFrom:
                fieldRef:
                  fieldPath: metadata.name
          command:
            - sh
            - -c
          # $(POD_NAME) is substituted by Kubernetes from the env entry above;
          # $(date) is not a defined variable, so it is left for the shell.
          args:
            - |
              echo "<h1>Web Server $(POD_NAME)</h1><p>Deployed at $(date)</p>" > /usr/share/nginx/html/index.html
          volumeMounts:
            - name: nginx-html
              mountPath: /usr/share/nginx/html
      containers:
        - name: nginx
          image: nginx:1.27-alpine
          ports:
            - containerPort: 80
          volumeMounts:
            - name: nginx-html
              mountPath: /usr/share/nginx/html
              readOnly: true
|
||||
---
|
||||
# ClusterIP Service exposing port 80 of the pods labelled app=nginx-web.
apiVersion: v1
kind: Service
metadata:
  name: nginx-service
spec:
  type: ClusterIP
  selector:
    app: nginx-web
  ports:
    - port: 80
      targetPort: 80
|
||||
@@ -0,0 +1,21 @@
|
||||
# Ingress of class "alb": internet-facing, dual-stack, IP targets.
# Routes every path ("/" prefix) to nginx-service on port 80.
apiVersion: networking.k8s.io/v1
kind: Ingress
metadata:
  name: aln-ingress-nginx-service
  annotations:
    alb.ingress.kubernetes.io/target-type: ip
    alb.ingress.kubernetes.io/scheme: internet-facing
    alb.ingress.kubernetes.io/ip-address-type: dualstack
    alb.ingress.kubernetes.io/healthcheck-path: /
spec:
  ingressClassName: alb
  rules:
    - http:
        paths:
          - pathType: Prefix
            path: /
            backend:
              service:
                name: nginx-service
                port:
                  number: 80
|
||||
@@ -0,0 +1,297 @@
|
||||
/**
|
||||
* # eks-ipv6-nginxpod
|
||||
*
|
||||
* ## Features
|
||||
* - Use terraform-aws-eks to deploy eks cluster and a nodegroup using spot instances
|
||||
* - Use Ipv6 for eks cluster
|
||||
* - Dependent VPC and roles are created
|
||||
* - Use pod identity for EBS and loadbalancer controller
|
||||
* - Create a bastion to manage EKS cluster
|
||||
*
|
||||
*
|
||||
*/
|
||||
data "aws_region" "this" {}
|
||||
|
||||
# Eks Vpc on IPv6
|
||||
# One-word random name; used as a suffix in cluster, bastion and key names.
resource "random_pet" "pet" {
  length = 1
}
|
||||
|
||||
# VPC address plan. The /16 is split into four /18 blocks: block 1 is the base
# for the private subnets and block 2 for the public subnets, which leaves
# room for future expansion.
locals {
  vpc_cidr = "10.18.0.0/16"

  private_net_start = cidrsubnet(local.vpc_cidr, 2, 1) # 10.18.64.0/18
  public_net_start  = cidrsubnet(local.vpc_cidr, 2, 2) # 10.18.128.0/18
}
|
||||
|
||||
# Availability zones currently in the "available" state.
data "aws_availability_zones" "this" {
  state = "available"
}
|
||||
|
||||
# Pick two of the available zones at random.
resource "random_shuffle" "Select2Az" {
  result_count = 2
  input        = data.aws_availability_zones.this.names
}
|
||||
|
||||
# Dual-stack VPC for the EKS cluster, spread over the two randomly selected
# availability zones.
module "vpc" {
  source  = "terraform-aws-modules/vpc/aws"
  version = "6.6.0"

  name = "lab-vpc"
  cidr = local.vpc_cidr
  azs  = random_shuffle.Select2Az.result

  # DNS
  enable_dns_support   = true
  enable_dns_hostnames = true

  # IPv4 subnets
  # private_subnet_ipv6_native = true # EKS requires free IPv4 addresses. see README
  private_subnets = cidrsubnets(local.private_net_start, 4, 4) # EKS requires free IPv4 addresses. see README
  public_subnets  = cidrsubnets(local.public_net_start, 8, 8)  # 2 AZ required by eks lbc

  # IPv6
  enable_ipv6                                    = true
  private_subnet_ipv6_prefixes                   = [10, 11]
  public_subnet_ipv6_prefixes                    = [0, 1]
  private_subnet_assign_ipv6_address_on_creation = true
  public_subnet_assign_ipv6_address_on_creation  = true

  public_subnet_tags = {
    "kubernetes.io/role/elb" = 1
  }

  # nat gateway and eigw (vpc module creates the dns64 /64 route to NGW)
  create_egress_only_igw = true
  single_nat_gateway     = true
  enable_nat_gateway     = true # AWS public endpoints do not support IPv6

  # Flow logs and default network ACL management are switched off.
  manage_default_network_acl           = false
  create_flow_log_cloudwatch_iam_role  = false
  create_flow_log_cloudwatch_log_group = false
  enable_flow_log                      = false
}
|
||||
|
||||
# EKS resources
|
||||
# IAM role for the EBS CSI driver. The trust policy lets the EKS Pod Identity
# service principal (pods.eks.amazonaws.com) assume the role and tag the session.
module "CsiPodIdentity" {
  source = "../../modules/security_identity_compliance/iam-role-v2"

  role-name   = "AmazonEBSCSIDriverRole"
  description = "EKSCSIDriverRole"

  trusted-entity = jsonencode({
    Version = "2012-10-17"
    Statement = [
      {
        Effect    = "Allow"
        Principal = { Service = "pods.eks.amazonaws.com" }
        Action    = ["sts:AssumeRole", "sts:TagSession"]
      }
    ]
  })
}
|
||||
|
||||
# 2 policies are required for the ebs csi to work
|
||||
# One attachment per AWS-managed policy ARN in the set below.
resource "aws_iam_role_policy_attachment" "CsiPodIdentity" {
  for_each = toset([
    "arn:aws:iam::aws:policy/service-role/AmazonEBSCSIDriverPolicy",
    "arn:aws:iam::aws:policy/AmazonEC2ReadOnlyAccess",
  ])

  policy_arn = each.value
  role       = module.CsiPodIdentity.name
}
|
||||
|
||||
locals {
  # Multipart MIME user data for the worker-node launch template. Its single
  # part is a node.eks.aws/v1alpha1 NodeConfig document filled in from the
  # outputs of module.eks.
  # clusterDNS is derived from the service CIDR by replacing everything from
  # the "/" onwards with "a".
  userdata = <<EOT
MIME-Version: 1.0
Content-Type: multipart/mixed; boundary="//"

--//
Content-Type: application/node.eks.aws

---
apiVersion: node.eks.aws/v1alpha1
kind: NodeConfig
spec:
  cluster:
    apiServerEndpoint: ${module.eks.cluster_endpoint}
    certificateAuthority: ${module.eks.cluster_certificate_authority_data}
    cidr: ${module.eks.cluster_service_cidr}
    name: ${module.eks.cluster_name}
  kubelet:
    config:
      maxPods: 110
      clusterDNS:
      - ${replace(module.eks.cluster_service_cidr, "/\\/.*/", "a")}

--//--
EOT
}
|
||||
|
||||
# Custom launch template for the managed node group: node security group,
# 20 GiB gp3 root volume, IMDSv2 with a hop limit of 2, and the NodeConfig
# user data from local.userdata.
resource "aws_launch_template" "node_lt" {
  name                   = "eks135-node-template"
  description            = "Launch template for eks 1.35"
  update_default_version = true
  vpc_security_group_ids = [module.eks.node_security_group_id]

  # must not specify this # image_id = data.aws_ami.eks_worker.id
  user_data = base64encode(local.userdata)

  block_device_mappings {
    device_name = "/dev/xvda"

    ebs {
      volume_type = "gp3"
      volume_size = 20
    }
  }

  # Critical: Set hop limit to 2 for pod IMDS access, required for aws lbc
  metadata_options {
    http_endpoint               = "enabled"
    http_tokens                 = "required" # IMDSv2 required
    http_put_response_hop_limit = 2          # Allows pods to reach IMDS
    instance_metadata_tags      = "enabled"
  }

  # Name tag for the instances ...
  tag_specifications {
    resource_type = "instance"
    tags          = { Name = "${module.eks.cluster_name}-worker" }
  }

  # ... and for their volumes.
  tag_specifications {
    resource_type = "volume"
    tags          = { Name = "${module.eks.cluster_name}-worker" }
  }
}
|
||||
|
||||
# eks optimized ami
|
||||
# data "aws_ami" "eks_worker" {
|
||||
# name_regex = "amazon-eks-node-al2023-x86_64-standard-1\\.35.*"
|
||||
# owners = ["800184023465"]
|
||||
# most_recent = true
|
||||
# }
|
||||
|
||||
# IPv6 EKS cluster with a private-only API endpoint, a spot managed node group
# on a custom launch template, and access entries for the admin role and the
# bastion role.
module "eks" {
  source = "terraform-aws-modules/eks/aws"
  # Pinned to the v21 series: the un-prefixed argument names used below (name,
  # kubernetes_version, addons, endpoint_*_access) are the v21 names, and an
  # unpinned registry module would silently follow the next major release.
  version = "~> 21.0"

  create_iam_role    = true
  name               = "${var.eks_cluster_name}-${random_pet.pet.id}"
  kubernetes_version = "1.35"
  # enabled_log_types = ["api", "audit", "authenticator", "controllerManager", "scheduler"]

  create_security_group = true
  security_group_additional_rules = {
    bastion_access = {
      description              = "Allow access from bastion"
      protocol                 = "tcp"
      from_port                = 443
      to_port                  = 443
      type                     = "ingress"
      source_security_group_id = module.bastion-sg.id
    }
  }

  vpc_id                      = module.vpc.vpc_id
  subnet_ids                  = module.vpc.private_subnets
  ip_family                   = "ipv6"
  create_cni_ipv6_iam_policy  = true
  create_kms_key              = true
  endpoint_private_access     = true
  endpoint_public_access      = false
  enable_irsa                 = false
  create_cloudwatch_log_group = false
  create_node_security_group  = true
  # authentication_mode = "API_AND_CONFIG_MAP" # use access entries and leave this to default

  upgrade_policy = {
    support_type = "STANDARD"
  }

  addons = {
    coredns = {}
    eks-pod-identity-agent = {
      before_compute = true
    }
    kube-proxy = {}
    aws-ebs-csi-driver = {
      pod_identity_association = [{
        role_arn        = module.CsiPodIdentity.role-arn
        service_account = "ebs-csi-controller-sa"
      }]
    }
    vpc-cni = {
      before_compute = true
      configuration_values = jsonencode({
        env = {
          ENABLE_POD_ENI                    = "true",
          POD_SECURITY_GROUP_ENFORCING_MODE = "strict",
          # in prefix mode, ipv6 will have /80 and ipv4 will have /28
          ENABLE_PREFIX_DELEGATION = "true"
        },
        init = {
          env = {
            DISABLE_TCP_EARLY_DEMUX = "true"
          }
        }
      })
    }
  }

  node_iam_role_additional_policies = {
    SsmManaged = "arn:aws:iam::aws:policy/AmazonSSMManagedInstanceCore"
  }

  eks_managed_node_groups = {
    EksNodeGroup1 = {
      # required for setting hop limit to 2 for pod IMDS access, required for aws lbc
      create_launch_template     = false
      use_custom_launch_template = true
      launch_template_id         = aws_launch_template.node_lt.id
      launch_template_version    = aws_launch_template.node_lt.latest_version

      min_size     = 2
      max_size     = 2
      desired_size = 2

      instance_types = ["t3.large"]
      capacity_type  = "SPOT"
      subnet_ids     = module.vpc.private_subnets
    }
  }

  # Both principals get the cluster-scoped AmazonEKSClusterAdminPolicy.
  access_entries = {
    ClusterAdminRole = {
      principal_arn = "arn:aws:iam::040216112220:role/rackLE"
      policy_associations = {
        ClusterAdminPolicy = {
          policy_arn = "arn:aws:eks::aws:cluster-access-policy/AmazonEKSClusterAdminPolicy"
          access_scope = {
            type = "cluster"
          }
        }
      }
    }
    BastionRole = {
      principal_arn = module.BastionRole.role-arn
      policy_associations = {
        ClusterAdminPolicy = {
          policy_arn = "arn:aws:eks::aws:cluster-access-policy/AmazonEKSClusterAdminPolicy"
          access_scope = {
            type = "cluster"
          }
        }
      }
    }
  }
}
|
||||
|
||||
|
||||
|
||||
# Allow http traffic from ALB to eks node
|
||||
# Ingress on the node security group: TCP/80 from the VPC's IPv6 CIDR block.
resource "aws_security_group_rule" "eks_node_alb_ingress" {
  description       = "ALB to nginx pods port 80"
  security_group_id = module.eks.node_security_group_id

  type             = "ingress"
  protocol         = "tcp"
  from_port        = 80
  to_port          = 80
  ipv6_cidr_blocks = [module.vpc.vpc_ipv6_cidr_block]
}
|
||||
@@ -0,0 +1,14 @@
|
||||
# # https://github.com/terraform-aws-modules/terraform-aws-eks-pod-identity
|
||||
# Pod identity for the AWS Load Balancer Controller: attaches the controller
# policy and associates it with the kube-system service account
# "aws-load-balancer-controller-sa" on the cluster.
# NOTE(review): no version constraint is set on this registry module; consider
# pinning it.
module "aws_lb_controller_pod_identity" {
  source = "terraform-aws-modules/eks-pod-identity/aws"

  name                            = "aws-loadbalancer-controller"
  attach_aws_lb_controller_policy = true

  associations = {
    this = {
      service_account = "aws-load-balancer-controller-sa"
      namespace       = "kube-system"
      cluster_name    = module.eks.cluster_name
    }
  }
}
|
||||
@@ -0,0 +1,35 @@
|
||||
# AWS provider with default tags applied to the resources it manages.
provider "aws" {
  region = var.aws-region

  default_tags {
    tags = {
      ServiceProvider = "RackspaceTechnology"
      Application     = var.application
      Environment     = var.environment
      Project         = var.project
      # Last two path components of the working directory.
      TerraformDir = join("/", reverse(slice(reverse(split("/", path.cwd)), 0, 2)))
    }
  }
}
|
||||
|
||||
# Terraform CLI and provider version constraints. The S3 backend is kept
# commented out, so no remote backend is configured in this block.
terraform {
  required_providers {
    aws = {
      version = "~> 6.0"
      source  = "hashicorp/aws"
    }
  }

  required_version = "~> 1.13.0"

  /*
  backend "s3" {
    bucket         = "whk1-bea-sys-ss-prd-tfgen2-state1"
    key            = "terraform_state/LandingZone/master-payer/sso.tfstate"
    region         = "ap-east-1"
    dynamodb_table = "whk1-bea-sys-ss-prd-tfgen2-lock"
    encrypt        = true
  }
  */
}
|
||||
|
||||
# Identity (account ID, ARN) of the credentials Terraform runs with.
data "aws_caller_identity" "current" {}
|
||||
@@ -0,0 +1,12 @@
|
||||
# Input variables. Everything except eks_cluster_name is required.

variable "aws-region" {
  description = "AWS region the provider operates in."
  type        = string
}

variable "aws-region-short" {
  description = "Short form of the AWS region name."
  type        = string
}

variable "customer-name" {
  description = "Customer name."
  type        = string
}

variable "environment" {
  description = "Environment name, used for the Environment tag and in resource names."
  type        = string
}

variable "project" {
  description = "Project name, used for the Project tag."
  type        = string
}

variable "application" {
  description = "Application name, used for the Application tag."
  type        = string
}

variable "eks_master_user_arn" {
  description = "ARN of the EKS master user."
  type        = string
}

variable "eks_cluster_name" {
  description = "Base name of the EKS cluster; a random suffix is appended to it."
  type        = string
  default     = "xpk-eks01"
}
|
||||
@@ -0,0 +1,15 @@
|
||||
# eks-managed-nodegroup
|
||||
Creates an EKS cluster using a managed node group, then performs EKS control plane upgrades.
|
||||
|
||||
## Versions and upgrade notes
|
||||
Based on 1-4 t3.medium worker nodes with no app pods.
|
||||
|
||||
| eks-ver | coredns | kube-proxy | vpc-cni | AMI-version | upgrade notes |
|
||||
|---------|--------------------|---------------------|--------------------|------------------|---------------------------------------------------------------------|
|
||||
| 1.25 | v1.9.3-eksbuild.10 | v1.25.16-eksbuild.1 | v1.15.4-eksbuild.1 | 1.25.15-20231201 | N/A |
|
||||
| 1.26 | v1.9.3-eksbuild.10 | v1.26.11-eksbuild.1 | v1.15.4-eksbuild.1 | 1.26.10-20231201 | from 1.25, set cluster_version = "1.26". nodes are recreated. 23min |
|
||||
| 1.27 | v1.10.1-eksbuild.6 | v1.27.6-eksbuild.2 | v1.15.4-eksbuild.1 | 1.27.7-20231201 | from 1.26, set cluster_version = "1.27". nodes are recreated. 16min |
|
||||
| 1.28 | v1.10.1-eksbuild.6 | v1.28.4-eksbuild.1 | v1.15.4-eksbuild.1 | 1.28.3-20231201 | from 1.27, set cluster_version = "1.28". nodes are recreated. 26min |
|
||||
|
||||
## References
|
||||
https://repost.aws/knowledge-center/eks-plan-upgrade-cluster
|
||||
@@ -0,0 +1,78 @@
|
||||
# Bastion instance for the EKS cluster; its user data installs kubectl.
module "bastion" {
  source  = "terraform-aws-modules/ec2-instance/aws"
  version = "5.5.0"

  name                   = "lab-ken2026-eks-bastion"
  instance_type          = "t3.micro"
  ami                    = data.aws_ami.this.id
  ignore_ami_changes     = true
  subnet_id              = var.subnet_ids[0]
  vpc_security_group_ids = [module.sg.id, module.eks.cluster_primary_security_group_id]

  create_iam_instance_profile = true
  iam_role_description        = "IAM role for EC2 instance"
  # NOTE(review): AdministratorAccess gives this instance full account access.
  iam_role_policies = {
    SSMManagedInstanceCore = "arn:aws:iam::aws:policy/AmazonSSMManagedInstanceCore"
    CloudwatchAgent        = "arn:aws:iam::aws:policy/CloudWatchAgentServerPolicy"
    Admin                  = "arn:aws:iam::aws:policy/AdministratorAccess"
  }

  key_name      = "kf-key"
  ebs_optimized = true
  root_block_device = [
    {
      encrypted   = true
      volume_type = "gp3"
      volume_size = 10
    },
  ]
  volume_tags = data.aws_default_tags.this.tags

  # IMDSv2 requirement
  metadata_options = {
    http_endpoint               = "enabled"
    http_tokens                 = "required"
    http_put_response_hop_limit = 2
  }

  # kubectl is downloaded from dl.k8s.io, the location documented by the
  # Kubernetes project, instead of the legacy kubernetes-release GCS bucket.
  user_data = <<EOF
#!/bin/bash
curl -LO "https://dl.k8s.io/release/$(curl -L -s https://dl.k8s.io/release/stable.txt)/bin/linux/amd64/kubectl"
chmod 755 kubectl
mv kubectl /usr/local/bin/
EOF
}
|
||||
|
||||
# Security group attached to the bastion.
# Rule strings are presumably "protocol,from_port,to_port,cidr,description";
# verify against the security_group module.
module "sg" {
  source = "../../modules/compute/security_group"

  name        = "lab-ken2026-eks-bastion-sg"
  description = "Security group for web server"
  vpc-id      = var.vpc_id

  ingress = {
    r1 = "icmp,8,-1,0.0.0.0/0,Allow ICMP traffic"
  }

  egress = {
    r1 = "tcp,0,65535,0.0.0.0/0,Allow outbound tcp traffic"
    r2 = "udp,0,65535,0.0.0.0/0,Allow outbound udp traffic"
    r3 = "icmp,0,-1,0.0.0.0/0,Allow icmp echo reply"
  }
}
|
||||
|
||||
data "aws_default_tags" "this" {}
|
||||
|
||||
# Most recent EBS-backed x86_64 HVM image matching the al2023 name pattern,
# restricted to the owner account below.
data "aws_ami" "this" {
  owners      = ["910595266909"] # AWS
  name_regex  = "al2023-ami-202.*"
  most_recent = true

  filter {
    name   = "architecture"
    values = ["x86_64"]
  }

  filter {
    name   = "root-device-type"
    values = ["ebs"]
  }

  filter {
    name   = "virtualization-type"
    values = ["hvm"]
  }
}
|
||||
@@ -0,0 +1,3 @@
|
||||
locals {
  # <environment>-<short region>-<customer>-<project>
  resource-prefix = "${var.environment}-${var.aws-region-short}-${var.customer-name}-${var.project}"
}
|
||||
@@ -0,0 +1,189 @@
|
||||
# Kubernetes provider pointed at the EKS cluster; a token is obtained by
# running "aws eks get-token" for the cluster.
provider "kubernetes" {
  cluster_ca_certificate = base64decode(module.eks.cluster_certificate_authority_data)
  host                   = module.eks.cluster_endpoint

  exec {
    # This requires the awscli to be installed locally where Terraform is executed
    command     = "aws"
    args        = ["eks", "get-token", "--cluster-name", module.eks.cluster_name]
    api_version = "client.authentication.k8s.io/v1beta1"
  }
}
|
||||
|
||||
# EKS control plane (module v19.21.0) with a public endpoint, managed add-ons,
# secrets encryption through the external KMS key, and two extra ingress rules
# on the cluster security group.
module "eks" {
  source  = "terraform-aws-modules/eks/aws"
  version = "19.21.0"

  cluster_name                   = "lab-ken2026-eks01"
  cluster_version                = "1.27"
  cluster_endpoint_public_access = true

  # Networking
  vpc_id                   = var.vpc_id
  subnet_ids               = var.subnet_ids
  control_plane_subnet_ids = var.control_plane_subnet_ids

  # Secrets encryption uses the key from module.kms rather than a key created
  # by this module.
  create_kms_key = false
  cluster_encryption_config = {
    provider_key_arn = module.kms.key_arn
    resources        = ["secrets"]
  }

  iam_role_additional_policies = {
    additional = aws_iam_policy.additional.arn
  }

  cluster_addons = {
    vpc-cni = {
      most_recent = true
    }
    kube-proxy = {
      most_recent = true
    }
    coredns = {
      most_recent = true
      preserve    = true

      timeouts = {
        create = "25m"
        delete = "10m"
      }
    }
  }

  # Extend cluster security group rules
  cluster_security_group_additional_rules = {
    ingress_nodes_ephemeral_ports_tcp = {
      description                = "Nodes on ephemeral ports"
      type                       = "ingress"
      protocol                   = "tcp"
      from_port                  = 1025
      to_port                    = 65535
      source_node_security_group = true
    }
    # Test: https://github.com/terraform-aws-modules/terraform-aws-eks/pull/2319
    ingress_source_security_group_id = {
      description              = "Ingress from another computed security group"
      type                     = "ingress"
      protocol                 = "tcp"
      from_port                = 22
      to_port                  = 22
      source_security_group_id = aws_security_group.additional.id
    }
  }

  # requires terraform be ran inside VPC
  # manage_aws_auth_configmap = true
  #
  # aws_auth_roles = [
  #   {
  #     rolearn  = module.eks_managed_node_group.iam_role_arn
  #     username = "system:node:{{EC2PrivateDNSName}}"
  #     groups = [
  #       "system:bootstrappers",
  #       "system:nodes",
  #     ]
  #   },
  #   {
  #     rolearn  = "arn:aws:iam::040216112220:role/rackLE"
  #     username = "rackLE"
  #     groups   = ["system:masters"]
  #   }
  # ]
  #
  # aws_auth_users = [
  #   {
  #     userarn  = var.eks_master_user_arn
  #     username = "eksmaster"
  #     groups   = ["system:masters"]
  #   }
  # ]
  #
  # aws_auth_accounts = [
  #   data.aws_caller_identity.current.account_id
  # ]
}
|
||||
|
||||
# Stand-alone managed node group (t3.medium, AL2 x86_64, 0-2 nodes) attached
# to the cluster created by module.eks.
module "eks_managed_node_group" {
  source  = "terraform-aws-modules/eks/aws//modules/eks-managed-node-group"
  version = "19.21.0"

  name            = "eks-mng"
  cluster_name    = module.eks.cluster_name
  cluster_version = module.eks.cluster_version

  # Capacity
  min_size       = 0
  max_size       = 2
  desired_size   = 1
  instance_types = ["t3.medium"]
  ami_type       = "AL2_x86_64"

  # Networking
  subnet_ids                        = var.subnet_ids
  cluster_primary_security_group_id = module.eks.cluster_primary_security_group_id
  vpc_security_group_ids = [
    module.eks.cluster_security_group_id,
    aws_security_group.additional.id
  ]

  iam_role_additional_policies = {
    SsmInstanceCore = "arn:aws:iam::aws:policy/AmazonSSMManagedInstanceCore"
  }

  # this will get added to what AWS provides
  # NOTE(review): the content is TOML-style settings while ami_type is
  # AL2_x86_64; confirm it is actually consumed for this AMI type.
  bootstrap_extra_args = <<-EOT
    # extra args added
    [settings.kernel]
    lockdown = "integrity"

    [settings.kubernetes.node-labels]
    "label1" = "foo"
    "label2" = "bar"
  EOT
}
|
||||
|
||||
|
||||
module "kms" {
|
||||
source = "terraform-aws-modules/kms/aws"
|
||||
version = "~> 1.5"
|
||||
|
||||
aliases = ["eks/${local.resource-prefix}"]
|
||||
description = "${local.resource-prefix} cluster encryption key"
|
||||
enable_default_policy = true
|
||||
key_owners = [data.aws_caller_identity.current.arn]
|
||||
}
|
||||
|
||||
resource "aws_security_group" "additional" {
|
||||
name_prefix = "${local.resource-prefix}-sg"
|
||||
vpc_id = var.vpc_id
|
||||
|
||||
ingress {
|
||||
from_port = 22
|
||||
to_port = 22
|
||||
protocol = "tcp"
|
||||
cidr_blocks = [
|
||||
"10.0.0.0/8",
|
||||
"172.16.0.0/12",
|
||||
"192.168.0.0/16",
|
||||
]
|
||||
}
|
||||
}
|
||||
|
||||
resource "aws_iam_policy" "additional" {
|
||||
name = "${local.resource-prefix}-policy"
|
||||
|
||||
policy = jsonencode({
|
||||
Version = "2012-10-17"
|
||||
Statement = [
|
||||
{
|
||||
Action = [
|
||||
"ec2:Describe*",
|
||||
]
|
||||
Effect = "Allow"
|
||||
Resource = "*"
|
||||
},
|
||||
]
|
||||
})
|
||||
}
|
||||
|
||||
|
||||
data "aws_caller_identity" "current" {}
|
||||
@@ -0,0 +1,30 @@
|
||||
provider "aws" {
|
||||
region = var.aws-region
|
||||
|
||||
default_tags {
|
||||
tags = {
|
||||
ServiceProvider = "RackspaceTechnology"
|
||||
Environment = var.environment
|
||||
Project = var.project
|
||||
Application = var.application
|
||||
TerraformMode = "managed"
|
||||
TerraformDir = "${reverse(split("/", path.cwd))[1]}/${reverse(split("/", path.cwd))[0]}"
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
terraform {
|
||||
required_version = ">= 1.3.0"
|
||||
required_providers {
|
||||
aws = {
|
||||
source = "hashicorp/aws"
|
||||
version = ">= 5.0"
|
||||
}
|
||||
}
|
||||
backend "s3" {
|
||||
bucket = "lab-ken2026-tf-state"
|
||||
key = "experimental/eks-upgrade-test.tfstate"
|
||||
region = "ap-east-1"
|
||||
encrypt = true
|
||||
}
|
||||
}
|
||||
@@ -0,0 +1,11 @@
|
||||
aws-region = "ap-east-1"
|
||||
aws-region-short = "ape1"
|
||||
customer-name = "ken2026"
|
||||
environment = "lab"
|
||||
project = "eks-pub-module-test"
|
||||
application = "terraform"
|
||||
|
||||
vpc_id = "vpc-01a10b033169f89a8"
|
||||
subnet_ids = ["subnet-0927ba1b06ccfe6c5", "subnet-08dec6787782ee087"]
|
||||
control_plane_subnet_ids = ["subnet-0927ba1b06ccfe6c5", "subnet-08dec6787782ee087"]
|
||||
eks_master_user_arn = "arn:aws:iam::040216112220:role/rackLE"
|
||||
@@ -0,0 +1,11 @@
|
||||
# Naming / tagging inputs shared by every layer.
variable "aws-region" {
  description = "AWS region the layer is deployed to, e.g. ap-east-1."
  type        = string
}

variable "aws-region-short" {
  description = "Short code for the region, e.g. ape1."
  type        = string
}

variable "customer-name" {
  description = "Customer identifier used when building resource names."
  type        = string
}

variable "environment" {
  description = "Environment name, applied as the Environment default tag."
  type        = string
}

variable "project" {
  description = "Project name, applied as the Project default tag."
  type        = string
}

variable "application" {
  description = "Application name, applied as the Application default tag."
  type        = string
}

# Network / access inputs for the EKS cluster and its managed node group.
variable "vpc_id" {
  description = "ID of the VPC that hosts the additional security group."
  type        = string
}

variable "subnet_ids" {
  description = "Subnet IDs used by the managed node group."
  type        = list(string)
}

variable "control_plane_subnet_ids" {
  description = "Subnet IDs intended for the EKS control plane."
  type        = list(string)
}

variable "eks_master_user_arn" {
  description = "IAM principal ARN that should be granted cluster-admin access."
  type        = string
}
|
||||
+227
@@ -0,0 +1,227 @@
|
||||
locals {
|
||||
name = "${var.environment}-${var.customer-name}"
|
||||
}
|
||||
|
||||
module "emr" {
|
||||
source = "terraform-aws-modules/emr/aws"
|
||||
version = "1.2.0"
|
||||
|
||||
name = "${local.name}-emr"
|
||||
release_label = "emr-7.0.0"
|
||||
security_configuration_name = aws_emr_security_configuration.security_config.name
|
||||
applications = ["hbase", "phoenix"]
|
||||
auto_termination_policy = {
|
||||
idle_timeout = 3600
|
||||
}
|
||||
|
||||
bootstrap_action = {
|
||||
}
|
||||
|
||||
configurations_json = jsonencode([
|
||||
{
|
||||
Classification : "hbase-env",
|
||||
Configurations : [
|
||||
{
|
||||
"Classification" : "export",
|
||||
"Properties" : {
|
||||
"HBASE_MASTER_OPTS" : "-Xmx4g",
|
||||
"HBASE_REGIONSERVER_OPTS" : "-Xmx8g"
|
||||
}
|
||||
}
|
||||
],
|
||||
Properties : {}
|
||||
},
|
||||
{
|
||||
Classification : "hbase-site",
|
||||
Properties : {
|
||||
"hbase.regionserver.handler.count" : "300"
|
||||
}
|
||||
}
|
||||
])
|
||||
|
||||
master_instance_fleet = {
|
||||
name = "master-fleet"
|
||||
target_on_demand_capacity = 1
|
||||
instance_type_configs = [
|
||||
{
|
||||
instance_type = "c6g.xlarge"
|
||||
ebs_config = {
|
||||
size = 20
|
||||
type = "gp3"
|
||||
volumes_per_instance = 1
|
||||
}
|
||||
}
|
||||
]
|
||||
}
|
||||
|
||||
core_instance_fleet = {
|
||||
name = "core-fleet"
|
||||
target_on_demand_capacity = 0
|
||||
target_spot_capacity = 1
|
||||
instance_type_configs = [
|
||||
{
|
||||
bid_price_as_percentage_of_on_demand_price = 70
|
||||
instance_type = "c6g.xlarge"
|
||||
weighted_capacity = 1
|
||||
ebs_config = {
|
||||
size = 20
|
||||
type = "gp3"
|
||||
volumes_per_instance = 1
|
||||
}
|
||||
},
|
||||
{
|
||||
bid_price_as_percentage_of_on_demand_price = 70
|
||||
instance_type = "m6g.xlarge"
|
||||
weighted_capacity = 1
|
||||
ebs_config = {
|
||||
size = 20
|
||||
type = "gp3"
|
||||
volumes_per_instance = 1
|
||||
}
|
||||
}
|
||||
]
|
||||
launch_specifications = {
|
||||
spot_specification = {
|
||||
allocation_strategy = "capacity-optimized"
|
||||
block_duration_minutes = 0
|
||||
timeout_action = "SWITCH_TO_ON_DEMAND"
|
||||
timeout_duration_minutes = 5
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
ebs_root_volume_size = 20
|
||||
# Subnets should be tagged with
|
||||
# { "for-use-with-amazon-emr-managed-policies" = true }
|
||||
ec2_attributes = {
|
||||
subnet_ids = ["subnet-08dec6787782ee087", "subnet-0551e96ffd016192a"]
|
||||
key_name = "kf-key"
|
||||
}
|
||||
vpc_id = "vpc-01a10b033169f89a8"
|
||||
|
||||
# Required for creating public cluster
|
||||
is_private_cluster = false
|
||||
|
||||
keep_job_flow_alive_when_no_steps = true
|
||||
list_steps_states = ["PENDING", "RUNNING", "CANCEL_PENDING", "CANCELLED", "FAILED", "INTERRUPTED", "COMPLETED"]
|
||||
log_uri = "s3n://${module.s3_bucket.s3_bucket_id}/"
|
||||
|
||||
scale_down_behavior = "TERMINATE_AT_TASK_COMPLETION"
|
||||
step_concurrency_level = 3
|
||||
termination_protection = false
|
||||
visible_to_all_users = true
|
||||
service_iam_role_policies = {
|
||||
AmazonEMRServicePolicy_v2 = "arn:aws:iam::aws:policy/service-role/AmazonEMRServicePolicy_v2"
|
||||
PowerUser = "arn:aws:iam::aws:policy/PowerUserAccess"
|
||||
}
|
||||
iam_instance_profile_policies = {
|
||||
AmazonElasticMapReduceforEC2Role = "arn:aws:iam::aws:policy/service-role/AmazonElasticMapReduceforEC2Role"
|
||||
PowerUser = "arn:aws:iam::aws:policy/PowerUserAccess"
|
||||
}
|
||||
# Use managed scaling policy to refill spot instances
|
||||
managed_scaling_policy = {
|
||||
unit_type = "InstanceFleetUnits"
|
||||
minimum_capacity_units = 1
|
||||
maximum_capacity_units = 4
|
||||
maximum_ondemand_capacity_units = 0
|
||||
maximum_core_capacity_units = 4
|
||||
}
|
||||
}
|
||||
|
||||
resource "random_id" "this" {
|
||||
byte_length = 2
|
||||
}
|
||||
|
||||
module "s3_bucket" {
|
||||
source = "terraform-aws-modules/s3-bucket/aws"
|
||||
version = "~> 3.0"
|
||||
|
||||
bucket = "${local.name}-emrlogs-${random_id.this.dec}"
|
||||
|
||||
# Allow deletion of non-empty bucket
|
||||
# Example usage only - not recommended for production
|
||||
force_destroy = true
|
||||
|
||||
attach_deny_insecure_transport_policy = true
|
||||
attach_require_latest_tls_policy = true
|
||||
|
||||
block_public_acls = true
|
||||
block_public_policy = true
|
||||
ignore_public_acls = true
|
||||
restrict_public_buckets = true
|
||||
|
||||
server_side_encryption_configuration = {
|
||||
rule = {
|
||||
apply_server_side_encryption_by_default = {
|
||||
sse_algorithm = "AES256"
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
# Customer-managed key referenced by the EMR security configuration for
# local-disk / EBS encryption.
resource "aws_kms_key" "ebs" {
  description             = "KMS key for EBS volumes"
  deletion_window_in_days = 7

  # Rotate the backing key material automatically; existing ciphertext stays
  # readable because KMS keeps the previous key versions.
  enable_key_rotation = true
}
|
||||
|
||||
resource "aws_emr_security_configuration" "security_config" {
|
||||
name = "${local.name}-emr-security-config"
|
||||
|
||||
configuration = jsonencode(
|
||||
{
|
||||
EncryptionConfiguration = {
|
||||
AtRestEncryptionConfiguration = {
|
||||
LocalDiskEncryptionConfiguration = {
|
||||
AwsKmsKey = aws_kms_key.ebs.arn
|
||||
EnableEbsEncryption = true
|
||||
EncryptionKeyProviderType = "AwsKms"
|
||||
}
|
||||
S3EncryptionConfiguration = {
|
||||
EncryptionMode = "SSE-S3"
|
||||
}
|
||||
}
|
||||
EnableAtRestEncryption = true
|
||||
EnableInTransitEncryption = false
|
||||
}
|
||||
InstanceMetadataServiceConfiguration = {
|
||||
HttpPutResponseHopLimit = 1
|
||||
MinimumInstanceMetadataServiceVersion = 2
|
||||
}
|
||||
}
|
||||
)
|
||||
}
|
||||
|
||||
# Tag EMR master and core instances
|
||||
# Need to run this layer twice to set instance tags
|
||||
# Adding depends_on will result in a dependency loop
|
||||
data "aws_instances" "master_instances" {
|
||||
# depends_on = [module.emr]
|
||||
instance_tags = {
|
||||
"aws:elasticmapreduce:instance-group-role" = "MASTER"
|
||||
}
|
||||
instance_state_names = ["running"]
|
||||
}
|
||||
|
||||
data "aws_instances" "core_instances" {
|
||||
# depends_on = [module.emr]
|
||||
instance_tags = {
|
||||
"aws:elasticmapreduce:instance-group-role" = "CORE"
|
||||
}
|
||||
instance_state_names = ["running"]
|
||||
}
|
||||
|
||||
resource "aws_ec2_tag" "tag-emr-core-instances" {
|
||||
# depends_on = [data.aws_instances.core_instances]
|
||||
count = length(data.aws_instances.core_instances.ids)
|
||||
resource_id = sort(data.aws_instances.core_instances.ids)[count.index]
|
||||
key = "Name"
|
||||
value = "${local.name}-emr-core-${count.index + 1}"
|
||||
}
|
||||
|
||||
resource "aws_ec2_tag" "tag-emr-master-instances" {
|
||||
# depends_on = [data.aws_instances.master_instances]
|
||||
count = length(data.aws_instances.master_instances.ids)
|
||||
resource_id = sort(data.aws_instances.master_instances.ids)[count.index]
|
||||
key = "Name"
|
||||
value = "${local.name}-emr-master-${count.index + 1}"
|
||||
}
|
||||
@@ -0,0 +1,7 @@
|
||||
output "core_instance_ids" {
|
||||
value = data.aws_instances.core_instances.ids
|
||||
}
|
||||
|
||||
output "master_instance_ids" {
|
||||
value = data.aws_instances.master_instances.ids
|
||||
}
|
||||
@@ -0,0 +1,22 @@
|
||||
provider "aws" {
|
||||
region = var.aws-region
|
||||
default_tags {
|
||||
tags = {
|
||||
Environment = var.environment
|
||||
Project = var.project
|
||||
Application = var.application
|
||||
TerraformMode = "managed"
|
||||
TerraformDir = "${reverse(split("/", path.cwd))[1]}/${reverse(split("/", path.cwd))[0]}"
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
terraform {
|
||||
required_version = ">= 1.3.0"
|
||||
required_providers {
|
||||
aws = {
|
||||
source = "hashicorp/aws"
|
||||
version = "~> 5.0.0"
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -0,0 +1,6 @@
|
||||
aws-region = "ap-east-1"
|
||||
# aws-region-short = "ape1"
|
||||
customer-name = "ken2026"
|
||||
environment = "lab"
|
||||
project = "iac"
|
||||
application = "emr"
|
||||
@@ -0,0 +1,9 @@
|
||||
variable "aws-region" {}
|
||||
# variable "aws-region-short" {}
|
||||
variable "customer-name" {}
|
||||
variable "environment" {}
|
||||
variable "project" {}
|
||||
variable "application" {}
|
||||
locals {
|
||||
resource-prefix = "${var.environment}-${substr(var.aws-region, 0, 2)}-${var.customer-name}-${var.project}"
|
||||
}
|
||||
@@ -0,0 +1,24 @@
|
||||
/*
|
||||
Note that attributes of ephemeral resources can only be passed to write-only arguments of managed resources,
|
||||
such as secret_string_wo
|
||||
*/
|
||||
|
||||
ephemeral "random_password" "example" {
|
||||
length = 16
|
||||
special = true
|
||||
}
|
||||
|
||||
resource "aws_secretsmanager_secret" "example" {
|
||||
name = "example-secret"
|
||||
description = "example secret created from ephemeral resource"
|
||||
}
|
||||
|
||||
resource "aws_secretsmanager_secret_version" "example" {
|
||||
secret_id = aws_secretsmanager_secret.example.id
|
||||
secret_string_wo = ephemeral.random_password.example.result
|
||||
secret_string_wo_version = 1
|
||||
}
|
||||
|
||||
ephemeral "aws_secretsmanager_secret_version" "example" {
|
||||
secret_id = aws_secretsmanager_secret_version.example.secret_id
|
||||
}
|
||||
@@ -0,0 +1,13 @@
|
||||
terraform {
  # This layer uses `ephemeral` blocks (Terraform >= 1.10) and write-only
  # arguments such as secret_string_wo (Terraform >= 1.11). Older CLIs fail
  # with a parse error instead of a clear version message.
  required_version = ">= 1.11.0"

  required_providers {
    aws = {
      source = "hashicorp/aws"
      # NOTE(review): secret_string_wo appears to have first shipped in
      # provider 5.87.0 - confirm against the provider changelog.
      version = ">= 5.87.0"
    }
    random = {
      source = "hashicorp/random"
      # First release that offers the ephemeral random_password resource.
      version = ">= 3.7.1"
    }
  }
}
|
||||
Executable
+3
@@ -0,0 +1,3 @@
|
||||
#!/bin/bash
# Helper for Terraform's `external` data source.
# Prints a JSON object {"result": "<id1> <id2> ..."} holding the identifiers of
# all RDS instances visible to the current AWS credentials, space separated.

# Abort on any error, including a failed `aws` call inside the pipeline, so
# Terraform surfaces the failure instead of silently receiving an empty list.
set -euo pipefail

RESULTS=$(aws rds describe-db-instances --query 'DBInstances[*].DBInstanceIdentifier' --output text | xargs)
jq -n --arg result "$RESULTS" '{"result":$result}'
|
||||
@@ -0,0 +1,7 @@
|
||||
# Runs the helper script and exposes its JSON output as a map of strings.
data "external" "rds-instances" {
  # Anchor the script to this module's directory so the lookup does not depend
  # on the directory Terraform happens to be invoked from.
  program = ["bash", "${path.module}/list-rds-instances.sh"]
}
|
||||
|
||||
# List of RDS instance identifiers returned by the helper script.
output "rds-instances" {
  # split(" ", "") yields [""]; compact() drops the empty element so an
  # account with no RDS instances produces an empty list.
  value = compact(split(" ", data.external.rds-instances.result.result))
}
|
||||
@@ -0,0 +1,67 @@
|
||||
module "iam-group" {
|
||||
source = "../../modules/security_identity_compliance/iam-group"
|
||||
|
||||
iam-group-name = "ViewOnlyUsers001"
|
||||
iam-group-policy = ""
|
||||
iam-group-policy-name = ""
|
||||
managed-policy-arns = ["arn:aws:iam::aws:policy/job-function/ViewOnlyAccess"]
|
||||
}
|
||||
|
||||
module "iam-group2" {
|
||||
source = "../../modules/security_identity_compliance/iam-group"
|
||||
|
||||
iam-group-name = "ViewOnlyAndS3Admin001"
|
||||
iam-group-policy = data.aws_iam_policy_document.user-policy.json
|
||||
iam-group-policy-name = "S3AdminPermissions"
|
||||
managed-policy-arns = ["arn:aws:iam::aws:policy/job-function/ViewOnlyAccess"]
|
||||
}
|
||||
|
||||
module "iam-user1" {
|
||||
source = "../../modules/security_identity_compliance/iam-user"
|
||||
|
||||
iam-user-name = "JohnNotInGroup"
|
||||
create-access-key = true
|
||||
create-password = true
|
||||
managed-policy-arns = ["arn:aws:iam::aws:policy/job-function/ViewOnlyAccess"]
|
||||
}
|
||||
|
||||
module "iam-user2" {
|
||||
source = "../../modules/security_identity_compliance/iam-user"
|
||||
|
||||
iam-user-name = "PeterInGroup"
|
||||
iam-user-policy = data.aws_iam_policy_document.user-policy.json
|
||||
iam-user-policy-name = "S3AdminPermissions"
|
||||
create-access-key = false
|
||||
create-password = false
|
||||
managed-policy-arns = ["arn:aws:iam::aws:policy/job-function/ViewOnlyAccess"]
|
||||
add-to-groups = [module.iam-group.iam-group-name]
|
||||
}
|
||||
|
||||
data "aws_iam_policy_document" "user-policy" {
|
||||
statement {
|
||||
sid = "s3admin"
|
||||
|
||||
actions = [
|
||||
"s3:*"
|
||||
]
|
||||
|
||||
effect = "Allow"
|
||||
resources = ["*"]
|
||||
}
|
||||
}
|
||||
|
||||
output "iam-user1-arn" {
|
||||
value = module.iam-user1.iam-user-arn
|
||||
}
|
||||
|
||||
output "iam-user2-arn" {
|
||||
value = module.iam-user2.iam-user-arn
|
||||
}
|
||||
|
||||
output "iam-user1-access-key" {
|
||||
value = module.iam-user1.iam-user-access-key
|
||||
}
|
||||
|
||||
output iam-user1-secret-location {
|
||||
value = module.iam-user1.iam-user-secret-arn
|
||||
}
|
||||
@@ -0,0 +1,8 @@
|
||||
aws-region = "ap-southeast-1"
|
||||
customer-name = "ken2026"
|
||||
environment = "dev"
|
||||
project = "iac"
|
||||
application = "terraform"
|
||||
costcenter = "none"
|
||||
DynamicAddressGroup = ""
|
||||
owner = "Rackspace"
|
||||
@@ -0,0 +1,21 @@
|
||||
variable "aws-region" {}
|
||||
variable "customer-name" {}
|
||||
variable "environment" {}
|
||||
variable "project" {}
|
||||
variable "application" {}
|
||||
variable "owner" {}
|
||||
variable "costcenter" {}
|
||||
variable "DynamicAddressGroup" {}
|
||||
|
||||
locals {
  # Tags applied to every resource created by this layer.
  default-tags = {
    ServiceProvider = "RackspaceTechnology"
    Environment     = var.environment
    Project         = var.project
    Application     = var.application
    TerraformMode   = "managed"
    Owner           = var.owner
    # Last two path components of the working directory ("<parent>/<layer>").
    TerraformDir = join("/", reverse(slice(reverse(split("/", path.cwd)), 0, 2)))
  }

  # <environment>-<first two chars of region>-<customer>-<project>.
  # substr() must sit inside the ${...} interpolation; outside it the function
  # call is emitted as literal text in the prefix.
  resource-prefix = "${var.environment}-${substr(var.aws-region, 0, 2)}-${var.customer-name}-${var.project}"
}
|
||||
@@ -0,0 +1,47 @@
|
||||
<!-- This readme file is generated with terraform-docs -->
|
||||
## Prepare lambda-layer1 with the following command.
|
||||
This directory layout is required by AWS Lambda. See https://docs.aws.amazon.com/lambda/latest/dg/packaging-layers.html
|
||||
|
||||
```bash
|
||||
pip install requests -t python/lib/python3.12/site-packages/
|
||||
```
|
||||
|
||||
## Requirements
|
||||
|
||||
| Name | Version |
|
||||
|------|---------|
|
||||
| terraform | >= 1.3.0 |
|
||||
| aws | >= 4.40 |
|
||||
|
||||
## Providers
|
||||
|
||||
| Name | Version |
|
||||
|------|---------|
|
||||
| archive | 2.5.0 |
|
||||
| aws | 5.64.0 |
|
||||
|
||||
## Modules
|
||||
|
||||
No modules.
|
||||
|
||||
## Resources
|
||||
|
||||
| Name | Type |
|
||||
|------|------|
|
||||
| [aws_iam_role.lambda-role1](https://registry.terraform.io/providers/hashicorp/aws/latest/docs/resources/iam_role) | resource |
|
||||
| [aws_lambda_function.myFunction](https://registry.terraform.io/providers/hashicorp/aws/latest/docs/resources/lambda_function) | resource |
|
||||
| [aws_lambda_layer_version.libraries](https://registry.terraform.io/providers/hashicorp/aws/latest/docs/resources/lambda_layer_version) | resource |
|
||||
| [archive_file.function1](https://registry.terraform.io/providers/hashicorp/archive/latest/docs/data-sources/file) | data source |
|
||||
| [archive_file.layer1](https://registry.terraform.io/providers/hashicorp/archive/latest/docs/data-sources/file) | data source |
|
||||
|
||||
## Inputs
|
||||
|
||||
No inputs.
|
||||
|
||||
## Outputs
|
||||
|
||||
No outputs.
|
||||
|
||||
---
|
||||
## Authorship
|
||||
This module was developed by xpk.
|
||||
@@ -0,0 +1,10 @@
|
||||
# reference: https://aws.amazon.com/premiumsupport/knowledge-center/start-stop-lambda-eventbridge/
|
||||
import requests
|
||||
|
||||
def lambda_handler(event, context):
    """Call https://ipinfo.io/ and report the HTTP status code.

    Args:
        event: Lambda invocation payload (unused).
        context: Lambda runtime context (unused).

    Returns:
        dict with a single key, "HttpResponseCode", holding the integer
        status code of the response.

    Raises:
        requests.exceptions.RequestException: if the request fails or times out.
    """
    # requests has no default timeout; without one a stalled connection blocks
    # until the Lambda itself is killed.
    r = requests.get('https://ipinfo.io/', timeout=10)
    return {
        "HttpResponseCode": r.status_code
    }
|
||||
|
||||
|
||||
Binary file not shown.
Binary file not shown.
+8
@@ -0,0 +1,8 @@
|
||||
#!/usr/bin/python3
|
||||
# -*- coding: utf-8 -*-
|
||||
import re
|
||||
import sys
|
||||
from charset_normalizer.cli import cli_detect
|
||||
if __name__ == '__main__':
|
||||
sys.argv[0] = re.sub(r'(-script\.pyw|\.exe)?$', '', sys.argv[0])
|
||||
sys.exit(cli_detect())
|
||||
+1
@@ -0,0 +1 @@
|
||||
pip
|
||||
+20
@@ -0,0 +1,20 @@
|
||||
This package contains a modified version of ca-bundle.crt:
|
||||
|
||||
ca-bundle.crt -- Bundle of CA Root Certificates
|
||||
|
||||
This is a bundle of X.509 certificates of public Certificate Authorities
|
||||
(CA). These were automatically extracted from Mozilla's root certificates
|
||||
file (certdata.txt). This file can be found in the mozilla source tree:
|
||||
https://hg.mozilla.org/mozilla-central/file/tip/security/nss/lib/ckfw/builtins/certdata.txt
|
||||
It contains the certificates in PEM format and therefore
|
||||
can be directly used with curl / libcurl / php_curl, or with
|
||||
an Apache+mod_ssl webserver for SSL client authentication.
|
||||
Just configure this file as the SSLCACertificateFile.#
|
||||
|
||||
***** BEGIN LICENSE BLOCK *****
|
||||
This Source Code Form is subject to the terms of the Mozilla Public License,
|
||||
v. 2.0. If a copy of the MPL was not distributed with this file, You can obtain
|
||||
one at http://mozilla.org/MPL/2.0/.
|
||||
|
||||
***** END LICENSE BLOCK *****
|
||||
@(#) $RCSfile: certdata.txt,v $ $Revision: 1.80 $ $Date: 2011/11/03 15:11:58 $
|
||||
+67
@@ -0,0 +1,67 @@
|
||||
Metadata-Version: 2.1
|
||||
Name: certifi
|
||||
Version: 2024.7.4
|
||||
Summary: Python package for providing Mozilla's CA Bundle.
|
||||
Home-page: https://github.com/certifi/python-certifi
|
||||
Author: Kenneth Reitz
|
||||
Author-email: me@kennethreitz.com
|
||||
License: MPL-2.0
|
||||
Project-URL: Source, https://github.com/certifi/python-certifi
|
||||
Classifier: Development Status :: 5 - Production/Stable
|
||||
Classifier: Intended Audience :: Developers
|
||||
Classifier: License :: OSI Approved :: Mozilla Public License 2.0 (MPL 2.0)
|
||||
Classifier: Natural Language :: English
|
||||
Classifier: Programming Language :: Python
|
||||
Classifier: Programming Language :: Python :: 3
|
||||
Classifier: Programming Language :: Python :: 3 :: Only
|
||||
Classifier: Programming Language :: Python :: 3.6
|
||||
Classifier: Programming Language :: Python :: 3.7
|
||||
Classifier: Programming Language :: Python :: 3.8
|
||||
Classifier: Programming Language :: Python :: 3.9
|
||||
Classifier: Programming Language :: Python :: 3.10
|
||||
Classifier: Programming Language :: Python :: 3.11
|
||||
Classifier: Programming Language :: Python :: 3.12
|
||||
Requires-Python: >=3.6
|
||||
License-File: LICENSE
|
||||
|
||||
Certifi: Python SSL Certificates
|
||||
================================
|
||||
|
||||
Certifi provides Mozilla's carefully curated collection of Root Certificates for
|
||||
validating the trustworthiness of SSL certificates while verifying the identity
|
||||
of TLS hosts. It has been extracted from the `Requests`_ project.
|
||||
|
||||
Installation
|
||||
------------
|
||||
|
||||
``certifi`` is available on PyPI. Simply install it with ``pip``::
|
||||
|
||||
$ pip install certifi
|
||||
|
||||
Usage
|
||||
-----
|
||||
|
||||
To reference the installed certificate authority (CA) bundle, you can use the
|
||||
built-in function::
|
||||
|
||||
>>> import certifi
|
||||
|
||||
>>> certifi.where()
|
||||
'/usr/local/lib/python3.7/site-packages/certifi/cacert.pem'
|
||||
|
||||
Or from the command line::
|
||||
|
||||
$ python -m certifi
|
||||
/usr/local/lib/python3.7/site-packages/certifi/cacert.pem
|
||||
|
||||
Enjoy!
|
||||
|
||||
.. _`Requests`: https://requests.readthedocs.io/en/master/
|
||||
|
||||
Addition/Removal of Certificates
|
||||
--------------------------------
|
||||
|
||||
Certifi does not support any addition/removal or other modification of the
|
||||
CA trust store content. This project is intended to provide a reliable and
|
||||
highly portable root of trust to python deployments. Look to upstream projects
|
||||
for methods to use alternate trust.
|
||||
+14
@@ -0,0 +1,14 @@
|
||||
certifi-2024.7.4.dist-info/INSTALLER,sha256=zuuue4knoyJ-UwPPXg8fezS7VCrXJQrAP7zeNuwvFQg,4
|
||||
certifi-2024.7.4.dist-info/LICENSE,sha256=6TcW2mucDVpKHfYP5pWzcPBpVgPSH2-D8FPkLPwQyvc,989
|
||||
certifi-2024.7.4.dist-info/METADATA,sha256=L9_EuPoQQvHFzxu03_ctaEZxhEty7inz569jGWjlLGo,2221
|
||||
certifi-2024.7.4.dist-info/RECORD,,
|
||||
certifi-2024.7.4.dist-info/WHEEL,sha256=y4mX-SOX4fYIkonsAGA5N0Oy-8_gI4FXw5HNI1xqvWg,91
|
||||
certifi-2024.7.4.dist-info/top_level.txt,sha256=KMu4vUCfsjLrkPbSNdgdekS-pVJzBAJFO__nI8NF6-U,8
|
||||
certifi/__init__.py,sha256=LHXz7E80YJYBzCBv6ZyidQ5-ciYSkSebpY2E5OM0l7o,94
|
||||
certifi/__main__.py,sha256=xBBoj905TUWBLRGANOcf7oi6e-3dMP4cEoG9OyMs11g,243
|
||||
certifi/__pycache__/__init__.cpython-312.pyc,,
|
||||
certifi/__pycache__/__main__.cpython-312.pyc,,
|
||||
certifi/__pycache__/core.cpython-312.pyc,,
|
||||
certifi/cacert.pem,sha256=SIupYGAr8HzGP073rsEIaS_sQYIPwzKKjj894DgUmu4,291528
|
||||
certifi/core.py,sha256=qRDDFyXVJwTB_EmoGppaXU_R9qCZvhl-EzxPMuV3nTA,4426
|
||||
certifi/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
||||
+5
@@ -0,0 +1,5 @@
|
||||
Wheel-Version: 1.0
|
||||
Generator: setuptools (70.2.0)
|
||||
Root-Is-Purelib: true
|
||||
Tag: py3-none-any
|
||||
|
||||
+1
@@ -0,0 +1 @@
|
||||
certifi
|
||||
@@ -0,0 +1,4 @@
|
||||
from .core import contents, where
|
||||
|
||||
__all__ = ["contents", "where"]
|
||||
__version__ = "2024.07.04"
|
||||
@@ -0,0 +1,12 @@
|
||||
import argparse
|
||||
|
||||
from certifi import contents, where
|
||||
|
||||
parser = argparse.ArgumentParser()
|
||||
parser.add_argument("-c", "--contents", action="store_true")
|
||||
args = parser.parse_args()
|
||||
|
||||
if args.contents:
|
||||
print(contents())
|
||||
else:
|
||||
print(where())
|
||||
File diff suppressed because it is too large
Load Diff
@@ -0,0 +1,114 @@
|
||||
"""
|
||||
certifi.py
|
||||
~~~~~~~~~~
|
||||
|
||||
This module returns the installation location of cacert.pem or its contents.
|
||||
"""
|
||||
import sys
|
||||
import atexit
|
||||
|
||||
def exit_cacert_ctx() -> None:
|
||||
_CACERT_CTX.__exit__(None, None, None) # type: ignore[union-attr]
|
||||
|
||||
|
||||
if sys.version_info >= (3, 11):
|
||||
|
||||
from importlib.resources import as_file, files
|
||||
|
||||
_CACERT_CTX = None
|
||||
_CACERT_PATH = None
|
||||
|
||||
def where() -> str:
|
||||
# This is slightly terrible, but we want to delay extracting the file
|
||||
# in cases where we're inside of a zipimport situation until someone
|
||||
# actually calls where(), but we don't want to re-extract the file
|
||||
# on every call of where(), so we'll do it once then store it in a
|
||||
# global variable.
|
||||
global _CACERT_CTX
|
||||
global _CACERT_PATH
|
||||
if _CACERT_PATH is None:
|
||||
# This is slightly janky, the importlib.resources API wants you to
|
||||
# manage the cleanup of this file, so it doesn't actually return a
|
||||
# path, it returns a context manager that will give you the path
|
||||
# when you enter it and will do any cleanup when you leave it. In
|
||||
# the common case of not needing a temporary file, it will just
|
||||
# return the file system location and the __exit__() is a no-op.
|
||||
#
|
||||
# We also have to hold onto the actual context manager, because
|
||||
# it will do the cleanup whenever it gets garbage collected, so
|
||||
# we will also store that at the global level as well.
|
||||
_CACERT_CTX = as_file(files("certifi").joinpath("cacert.pem"))
|
||||
_CACERT_PATH = str(_CACERT_CTX.__enter__())
|
||||
atexit.register(exit_cacert_ctx)
|
||||
|
||||
return _CACERT_PATH
|
||||
|
||||
def contents() -> str:
|
||||
return files("certifi").joinpath("cacert.pem").read_text(encoding="ascii")
|
||||
|
||||
elif sys.version_info >= (3, 7):
|
||||
|
||||
from importlib.resources import path as get_path, read_text
|
||||
|
||||
_CACERT_CTX = None
|
||||
_CACERT_PATH = None
|
||||
|
||||
def where() -> str:
|
||||
# This is slightly terrible, but we want to delay extracting the
|
||||
# file in cases where we're inside of a zipimport situation until
|
||||
# someone actually calls where(), but we don't want to re-extract
|
||||
# the file on every call of where(), so we'll do it once then store
|
||||
# it in a global variable.
|
||||
global _CACERT_CTX
|
||||
global _CACERT_PATH
|
||||
if _CACERT_PATH is None:
|
||||
# This is slightly janky, the importlib.resources API wants you
|
||||
# to manage the cleanup of this file, so it doesn't actually
|
||||
# return a path, it returns a context manager that will give
|
||||
# you the path when you enter it and will do any cleanup when
|
||||
# you leave it. In the common case of not needing a temporary
|
||||
# file, it will just return the file system location and the
|
||||
# __exit__() is a no-op.
|
||||
#
|
||||
# We also have to hold onto the actual context manager, because
|
||||
# it will do the cleanup whenever it gets garbage collected, so
|
||||
# we will also store that at the global level as well.
|
||||
_CACERT_CTX = get_path("certifi", "cacert.pem")
|
||||
_CACERT_PATH = str(_CACERT_CTX.__enter__())
|
||||
atexit.register(exit_cacert_ctx)
|
||||
|
||||
return _CACERT_PATH
|
||||
|
||||
def contents() -> str:
|
||||
return read_text("certifi", "cacert.pem", encoding="ascii")
|
||||
|
||||
else:
|
||||
import os
|
||||
import types
|
||||
from typing import Union
|
||||
|
||||
Package = Union[types.ModuleType, str]
|
||||
Resource = Union[str, "os.PathLike"]
|
||||
|
||||
# This fallback will work for Python versions prior to 3.7 that lack the
|
||||
# importlib.resources module but relies on the existing `where` function
|
||||
# so won't address issues with environments like PyOxidizer that don't set
|
||||
# __file__ on modules.
|
||||
def read_text(
|
||||
package: Package,
|
||||
resource: Resource,
|
||||
encoding: str = 'utf-8',
|
||||
errors: str = 'strict'
|
||||
) -> str:
|
||||
with open(where(), encoding=encoding) as data:
|
||||
return data.read()
|
||||
|
||||
# If we don't have importlib.resources, then we will just do the old logic
|
||||
# of assuming we're on the filesystem and munge the path directly.
|
||||
def where() -> str:
|
||||
f = os.path.dirname(__file__)
|
||||
|
||||
return os.path.join(f, "cacert.pem")
|
||||
|
||||
def contents() -> str:
|
||||
return read_text("certifi", "cacert.pem", encoding="ascii")
|
||||
+1
@@ -0,0 +1 @@
|
||||
pip
|
||||
+21
@@ -0,0 +1,21 @@
|
||||
MIT License
|
||||
|
||||
Copyright (c) 2019 TAHRI Ahmed R.
|
||||
|
||||
Permission is hereby granted, free of charge, to any person obtaining a copy
|
||||
of this software and associated documentation files (the "Software"), to deal
|
||||
in the Software without restriction, including without limitation the rights
|
||||
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
copies of the Software, and to permit persons to whom the Software is
|
||||
furnished to do so, subject to the following conditions:
|
||||
|
||||
The above copyright notice and this permission notice shall be included in all
|
||||
copies or substantial portions of the Software.
|
||||
|
||||
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
||||
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
||||
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
||||
SOFTWARE.
|
||||
+683
@@ -0,0 +1,683 @@
|
||||
Metadata-Version: 2.1
|
||||
Name: charset-normalizer
|
||||
Version: 3.3.2
|
||||
Summary: The Real First Universal Charset Detector. Open, modern and actively maintained alternative to Chardet.
|
||||
Home-page: https://github.com/Ousret/charset_normalizer
|
||||
Author: Ahmed TAHRI
|
||||
Author-email: ahmed.tahri@cloudnursery.dev
|
||||
License: MIT
|
||||
Project-URL: Bug Reports, https://github.com/Ousret/charset_normalizer/issues
|
||||
Project-URL: Documentation, https://charset-normalizer.readthedocs.io/en/latest
|
||||
Keywords: encoding,charset,charset-detector,detector,normalization,unicode,chardet,detect
|
||||
Classifier: Development Status :: 5 - Production/Stable
|
||||
Classifier: License :: OSI Approved :: MIT License
|
||||
Classifier: Intended Audience :: Developers
|
||||
Classifier: Topic :: Software Development :: Libraries :: Python Modules
|
||||
Classifier: Operating System :: OS Independent
|
||||
Classifier: Programming Language :: Python
|
||||
Classifier: Programming Language :: Python :: 3
|
||||
Classifier: Programming Language :: Python :: 3.7
|
||||
Classifier: Programming Language :: Python :: 3.8
|
||||
Classifier: Programming Language :: Python :: 3.9
|
||||
Classifier: Programming Language :: Python :: 3.10
|
||||
Classifier: Programming Language :: Python :: 3.11
|
||||
Classifier: Programming Language :: Python :: 3.12
|
||||
Classifier: Programming Language :: Python :: Implementation :: PyPy
|
||||
Classifier: Topic :: Text Processing :: Linguistic
|
||||
Classifier: Topic :: Utilities
|
||||
Classifier: Typing :: Typed
|
||||
Requires-Python: >=3.7.0
|
||||
Description-Content-Type: text/markdown
|
||||
License-File: LICENSE
|
||||
Provides-Extra: unicode_backport
|
||||
|
||||
<h1 align="center">Charset Detection, for Everyone 👋</h1>
|
||||
|
||||
<p align="center">
|
||||
<sup>The Real First Universal Charset Detector</sup><br>
|
||||
<a href="https://pypi.org/project/charset-normalizer">
|
||||
<img src="https://img.shields.io/pypi/pyversions/charset_normalizer.svg?orange=blue" />
|
||||
</a>
|
||||
<a href="https://pepy.tech/project/charset-normalizer/">
|
||||
<img alt="Download Count Total" src="https://static.pepy.tech/badge/charset-normalizer/month" />
|
||||
</a>
|
||||
<a href="https://bestpractices.coreinfrastructure.org/projects/7297">
|
||||
<img src="https://bestpractices.coreinfrastructure.org/projects/7297/badge">
|
||||
</a>
|
||||
</p>
|
||||
<p align="center">
|
||||
<sup><i>Featured Packages</i></sup><br>
|
||||
<a href="https://github.com/jawah/niquests">
|
||||
<img alt="Static Badge" src="https://img.shields.io/badge/Niquests-HTTP_1.1%2C%202%2C_and_3_Client-cyan">
|
||||
</a>
|
||||
<a href="https://github.com/jawah/wassima">
|
||||
<img alt="Static Badge" src="https://img.shields.io/badge/Wassima-Certifi_Killer-cyan">
|
||||
</a>
|
||||
</p>
|
||||
<p align="center">
|
||||
<sup><i>In other language (unofficial port - by the community)</i></sup><br>
|
||||
<a href="https://github.com/nickspring/charset-normalizer-rs">
|
||||
<img alt="Static Badge" src="https://img.shields.io/badge/Rust-red">
|
||||
</a>
|
||||
</p>
|
||||
|
||||
> A library that helps you read text from an unknown charset encoding.<br /> Motivated by `chardet`,
|
||||
> I'm trying to resolve the issue by taking a new approach.
|
||||
> All IANA character set names for which the Python core library provides codecs are supported.
|
||||
|
||||
<p align="center">
|
||||
>>>>> <a href="https://charsetnormalizerweb.ousret.now.sh" target="_blank">👉 Try Me Online Now, Then Adopt Me 👈 </a> <<<<<
|
||||
</p>
|
||||
|
||||
This project offers you an alternative to **Universal Charset Encoding Detector**, also known as **Chardet**.
|
||||
|
||||
| Feature | [Chardet](https://github.com/chardet/chardet) | Charset Normalizer | [cChardet](https://github.com/PyYoshi/cChardet) |
|
||||
|--------------------------------------------------|:---------------------------------------------:|:--------------------------------------------------------------------------------------------------:|:-----------------------------------------------:|
|
||||
| `Fast` | ❌ | ✅ | ✅ |
|
||||
| `Universal**` | ❌ | ✅ | ❌ |
|
||||
| `Reliable` **without** distinguishable standards | ❌ | ✅ | ✅ |
|
||||
| `Reliable` **with** distinguishable standards | ✅ | ✅ | ✅ |
|
||||
| `License` | LGPL-2.1<br>_restrictive_ | MIT | MPL-1.1<br>_restrictive_ |
|
||||
| `Native Python` | ✅ | ✅ | ❌ |
|
||||
| `Detect spoken language` | ❌ | ✅ | N/A |
|
||||
| `UnicodeDecodeError Safety` | ❌ | ✅ | ❌ |
|
||||
| `Whl Size (min)` | 193.6 kB | 42 kB | ~200 kB |
|
||||
| `Supported Encoding` | 33 | 🎉 [99](https://charset-normalizer.readthedocs.io/en/latest/user/support.html#supported-encodings) | 40 |
|
||||
|
||||
<p align="center">
|
||||
<img src="https://i.imgflip.com/373iay.gif" alt="Reading Normalized Text" width="226"/><img src="https://media.tenor.com/images/c0180f70732a18b4965448d33adba3d0/tenor.gif" alt="Cat Reading Text" width="200"/>
|
||||
</p>
|
||||
|
||||
*\*\* : They are clearly using specific code for a specific encoding even if covering most of used one*<br>
|
||||
Did you get here because of the logs? See [https://charset-normalizer.readthedocs.io/en/latest/user/miscellaneous.html](https://charset-normalizer.readthedocs.io/en/latest/user/miscellaneous.html)
|
||||
|
||||
## ⚡ Performance
|
||||
|
||||
This package offers better performance than its counterpart Chardet. Here are some numbers.
|
||||
|
||||
| Package | Accuracy | Mean per file (ms) | File per sec (est) |
|
||||
|-----------------------------------------------|:--------:|:------------------:|:------------------:|
|
||||
| [chardet](https://github.com/chardet/chardet) | 86 % | 200 ms | 5 file/sec |
|
||||
| charset-normalizer | **98 %** | **10 ms** | 100 file/sec |
|
||||
|
||||
| Package | 99th percentile | 95th percentile | 50th percentile |
|
||||
|-----------------------------------------------|:---------------:|:---------------:|:---------------:|
|
||||
| [chardet](https://github.com/chardet/chardet) | 1200 ms | 287 ms | 23 ms |
|
||||
| charset-normalizer | 100 ms | 50 ms | 5 ms |
|
||||
|
||||
Chardet's performance on larger files (1MB+) is very poor. Expect a huge difference on large payloads.
|
||||
|
||||
> Stats are generated using 400+ files using default parameters. More details on used files, see GHA workflows.
|
||||
> And yes, these results might change at any time. The dataset can be updated to include more files.
|
||||
> The actual delays heavily depends on your CPU capabilities. The factors should remain the same.
|
||||
> Keep in mind that the stats are generous and that Chardet accuracy vs our is measured using Chardet initial capability
|
||||
> (eg. Supported Encoding) Challenge-them if you want.
|
||||
|
||||
## ✨ Installation
|
||||
|
||||
Using pip:
|
||||
|
||||
```sh
|
||||
pip install charset-normalizer -U
|
||||
```
|
||||
|
||||
## 🚀 Basic Usage
|
||||
|
||||
### CLI
|
||||
This package comes with a CLI.
|
||||
|
||||
```
|
||||
usage: normalizer [-h] [-v] [-a] [-n] [-m] [-r] [-f] [-t THRESHOLD]
|
||||
file [file ...]
|
||||
|
||||
The Real First Universal Charset Detector. Discover originating encoding used
|
||||
on text file. Normalize text to unicode.
|
||||
|
||||
positional arguments:
|
||||
files File(s) to be analysed
|
||||
|
||||
optional arguments:
|
||||
-h, --help show this help message and exit
|
||||
-v, --verbose Display complementary information about file if any.
|
||||
Stdout will contain logs about the detection process.
|
||||
-a, --with-alternative
|
||||
Output complementary possibilities if any. Top-level
|
||||
JSON WILL be a list.
|
||||
-n, --normalize Permit to normalize input file. If not set, program
|
||||
does not write anything.
|
||||
-m, --minimal Only output the charset detected to STDOUT. Disabling
|
||||
JSON output.
|
||||
-r, --replace Replace file when trying to normalize it instead of
|
||||
creating a new one.
|
||||
-f, --force Replace file without asking if you are sure, use this
|
||||
flag with caution.
|
||||
-t THRESHOLD, --threshold THRESHOLD
|
||||
Define a custom maximum amount of chaos allowed in
|
||||
decoded content. 0. <= chaos <= 1.
|
||||
--version Show version information and exit.
|
||||
```
|
||||
|
||||
```bash
|
||||
normalizer ./data/sample.1.fr.srt
|
||||
```
|
||||
|
||||
or
|
||||
|
||||
```bash
|
||||
python -m charset_normalizer ./data/sample.1.fr.srt
|
||||
```
|
||||
|
||||
🎉 Since version 1.4.0 the CLI produces easily usable stdout results in JSON format.
|
||||
|
||||
```json
|
||||
{
|
||||
"path": "/home/default/projects/charset_normalizer/data/sample.1.fr.srt",
|
||||
"encoding": "cp1252",
|
||||
"encoding_aliases": [
|
||||
"1252",
|
||||
"windows_1252"
|
||||
],
|
||||
"alternative_encodings": [
|
||||
"cp1254",
|
||||
"cp1256",
|
||||
"cp1258",
|
||||
"iso8859_14",
|
||||
"iso8859_15",
|
||||
"iso8859_16",
|
||||
"iso8859_3",
|
||||
"iso8859_9",
|
||||
"latin_1",
|
||||
"mbcs"
|
||||
],
|
||||
"language": "French",
|
||||
"alphabets": [
|
||||
"Basic Latin",
|
||||
"Latin-1 Supplement"
|
||||
],
|
||||
"has_sig_or_bom": false,
|
||||
"chaos": 0.149,
|
||||
"coherence": 97.152,
|
||||
"unicode_path": null,
|
||||
"is_preferred": true
|
||||
}
|
||||
```
|
||||
|
||||
### Python
|
||||
*Just print out normalized text*
|
||||
```python
|
||||
from charset_normalizer import from_path
|
||||
|
||||
results = from_path('./my_subtitle.srt')
|
||||
|
||||
print(str(results.best()))
|
||||
```
|
||||
|
||||
*Upgrade your code without effort*
|
||||
```python
|
||||
from charset_normalizer import detect
|
||||
```
|
||||
|
||||
The above code will behave the same as **chardet**. We ensure that we offer the best (reasonable) BC result possible.
|
||||
|
||||
See the docs for advanced usage : [readthedocs.io](https://charset-normalizer.readthedocs.io/en/latest/)
|
||||
|
||||
## 😇 Why
|
||||
|
||||
When I started using Chardet, I noticed that it was not suited to my expectations, and I wanted to propose a
|
||||
reliable alternative using a completely different method. Also! I never back down on a good challenge!
|
||||
|
||||
I **don't care** about the **originating charset** encoding, because **two different tables** can
|
||||
produce **two identical rendered strings.**
|
||||
What I want is to get readable text, the best I can.
|
||||
|
||||
In a way, **I'm brute forcing text decoding.** How cool is that ? 😎
|
||||
|
||||
Don't confuse package **ftfy** with charset-normalizer or chardet. ftfy's goal is to repair Unicode strings, whereas charset-normalizer's goal is to convert raw files in an unknown encoding to Unicode.
|
||||
|
||||
## 🍰 How
|
||||
|
||||
- Discard all charset encoding tables that could not fit the binary content.
|
||||
- Measure noise, or the mess once opened (by chunks) with a corresponding charset encoding.
|
||||
- Extract matches with the lowest mess detected.
|
||||
- Additionally, we measure coherence / probe for a language.
|
||||
|
||||
**Wait a minute**, what is noise/mess and coherence according to **YOU ?**
|
||||
|
||||
*Noise :* I opened hundreds of text files, **written by humans**, with the wrong encoding table. **I observed**, then
|
||||
**I established** some ground rules about **what is obvious** when **it seems like** a mess.
|
||||
I know that my interpretation of what is noise is probably incomplete, feel free to contribute in order to
|
||||
improve or rewrite it.
|
||||
|
||||
*Coherence :* For each language there is on earth, we have computed ranked letter appearance occurrences (the best we can). So I thought
|
||||
that intel is worth something here. So I use those records against decoded text to check if I can detect intelligent design.
|
||||
|
||||
## ⚡ Known limitations
|
||||
|
||||
- Language detection is unreliable when text contains two or more languages sharing identical letters. (eg. HTML (english tags) + Turkish content (Sharing Latin characters))
|
||||
- Every charset detector heavily depends on sufficient content. In common cases, do not bother running detection on very tiny content.
|
||||
|
||||
## ⚠️ About Python EOLs
|
||||
|
||||
**If you are running:**
|
||||
|
||||
- Python >=2.7,<3.5: Unsupported
|
||||
- Python 3.5: charset-normalizer < 2.1
|
||||
- Python 3.6: charset-normalizer < 3.1
|
||||
- Python 3.7: charset-normalizer < 4.0
|
||||
|
||||
Upgrade your Python interpreter as soon as possible.
|
||||
|
||||
## 👤 Contributing
|
||||
|
||||
Contributions, issues and feature requests are very much welcome.<br />
|
||||
Feel free to check [issues page](https://github.com/ousret/charset_normalizer/issues) if you want to contribute.
|
||||
|
||||
## 📝 License
|
||||
|
||||
Copyright © [Ahmed TAHRI @Ousret](https://github.com/Ousret).<br />
|
||||
This project is [MIT](https://github.com/Ousret/charset_normalizer/blob/master/LICENSE) licensed.
|
||||
|
||||
Characters frequencies used in this project © 2012 [Denny Vrandečić](http://simia.net/letters/)
|
||||
|
||||
## 💼 For Enterprise
|
||||
|
||||
Professional support for charset-normalizer is available as part of the [Tidelift
|
||||
Subscription][1]. Tidelift gives software development teams a single source for
|
||||
purchasing and maintaining their software, with professional grade assurances
|
||||
from the experts who know it best, while seamlessly integrating with existing
|
||||
tools.
|
||||
|
||||
[1]: https://tidelift.com/subscription/pkg/pypi-charset-normalizer?utm_source=pypi-charset-normalizer&utm_medium=readme
|
||||
|
||||
# Changelog
|
||||
All notable changes to charset-normalizer will be documented in this file. This project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).
|
||||
The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/).
|
||||
|
||||
## [3.3.2](https://github.com/Ousret/charset_normalizer/compare/3.3.1...3.3.2) (2023-10-31)
|
||||
|
||||
### Fixed
|
||||
- Unintentional memory usage regression when using large payload that match several encoding (#376)
|
||||
- Regression on some detection case showcased in the documentation (#371)
|
||||
|
||||
### Added
|
||||
- Noise (md) probe that identify malformed arabic representation due to the presence of letters in isolated form (credit to my wife)
|
||||
|
||||
## [3.3.1](https://github.com/Ousret/charset_normalizer/compare/3.3.0...3.3.1) (2023-10-22)
|
||||
|
||||
### Changed
|
||||
- Optional mypyc compilation upgraded to version 1.6.1 for Python >= 3.8
|
||||
- Improved the general detection reliability based on reports from the community
|
||||
|
||||
## [3.3.0](https://github.com/Ousret/charset_normalizer/compare/3.2.0...3.3.0) (2023-09-30)
|
||||
|
||||
### Added
|
||||
- Allow to execute the CLI (e.g. normalizer) through `python -m charset_normalizer.cli` or `python -m charset_normalizer`
|
||||
- Support for 9 forgotten encoding that are supported by Python but unlisted in `encoding.aliases` as they have no alias (#323)
|
||||
|
||||
### Removed
|
||||
- (internal) Redundant utils.is_ascii function and unused function is_private_use_only
|
||||
- (internal) charset_normalizer.assets is moved inside charset_normalizer.constant
|
||||
|
||||
### Changed
|
||||
- (internal) Unicode code blocks in constants are updated using the latest v15.0.0 definition to improve detection
|
||||
- Optional mypyc compilation upgraded to version 1.5.1 for Python >= 3.8
|
||||
|
||||
### Fixed
|
||||
- Unable to properly sort CharsetMatch when both chaos/noise and coherence were close due to an unreachable condition in \_\_lt\_\_ (#350)
|
||||
|
||||
## [3.2.0](https://github.com/Ousret/charset_normalizer/compare/3.1.0...3.2.0) (2023-06-07)
|
||||
|
||||
### Changed
|
||||
- Typehint for function `from_path` no longer enforce `PathLike` as its first argument
|
||||
- Minor improvement over the global detection reliability
|
||||
|
||||
### Added
|
||||
- Introduce function `is_binary` that relies on main capabilities, and optimized to detect binaries
|
||||
- Propagate `enable_fallback` argument throughout `from_bytes`, `from_path`, and `from_fp` that allow a deeper control over the detection (default True)
|
||||
- Explicit support for Python 3.12
|
||||
|
||||
### Fixed
|
||||
- Edge case detection failure where a file would contain 'very-long' camel cased word (Issue #289)
|
||||
|
||||
## [3.1.0](https://github.com/Ousret/charset_normalizer/compare/3.0.1...3.1.0) (2023-03-06)
|
||||
|
||||
### Added
|
||||
- Argument `should_rename_legacy` for legacy function `detect` and disregard any new arguments without errors (PR #262)
|
||||
|
||||
### Removed
|
||||
- Support for Python 3.6 (PR #260)
|
||||
|
||||
### Changed
|
||||
- Optional speedup provided by mypy/c 1.0.1
|
||||
|
||||
## [3.0.1](https://github.com/Ousret/charset_normalizer/compare/3.0.0...3.0.1) (2022-11-18)
|
||||
|
||||
### Fixed
|
||||
- Multi-bytes cutter/chunk generator did not always cut correctly (PR #233)
|
||||
|
||||
### Changed
|
||||
- Speedup provided by mypy/c 0.990 on Python >= 3.7
|
||||
|
||||
## [3.0.0](https://github.com/Ousret/charset_normalizer/compare/2.1.1...3.0.0) (2022-10-20)
|
||||
|
||||
### Added
|
||||
- Extend the capability of explain=True when cp_isolation contains at most two entries (min one), will log in details of the Mess-detector results
|
||||
- Support for alternative language frequency set in charset_normalizer.assets.FREQUENCIES
|
||||
- Add parameter `language_threshold` in `from_bytes`, `from_path` and `from_fp` to adjust the minimum expected coherence ratio
|
||||
- `normalizer --version` now specify if current version provide extra speedup (meaning mypyc compilation whl)
|
||||
|
||||
### Changed
|
||||
- Build with static metadata using 'build' frontend
|
||||
- Make the language detection stricter
|
||||
- Optional: Module `md.py` can be compiled using Mypyc to provide an extra speedup up to 4x faster than v2.1
|
||||
|
||||
### Fixed
|
||||
- CLI with opt --normalize fail when using full path for files
|
||||
- TooManyAccentuatedPlugin induce false positive on the mess detection when too few alpha character have been fed to it
|
||||
- Sphinx warnings when generating the documentation
|
||||
|
||||
### Removed
|
||||
- Coherence detector no longer return 'Simple English' instead return 'English'
|
||||
- Coherence detector no longer return 'Classical Chinese' instead return 'Chinese'
|
||||
- Breaking: Method `first()` and `best()` from CharsetMatch
|
||||
- UTF-7 will no longer appear as "detected" without a recognized SIG/mark (is unreliable/conflict with ASCII)
|
||||
- Breaking: Class aliases CharsetDetector, CharsetDoctor, CharsetNormalizerMatch and CharsetNormalizerMatches
|
||||
- Breaking: Top-level function `normalize`
|
||||
- Breaking: Properties `chaos_secondary_pass`, `coherence_non_latin` and `w_counter` from CharsetMatch
|
||||
- Support for the backport `unicodedata2`
|
||||
|
||||
## [3.0.0rc1](https://github.com/Ousret/charset_normalizer/compare/3.0.0b2...3.0.0rc1) (2022-10-18)
|
||||
|
||||
### Added
|
||||
- Extend the capability of explain=True when cp_isolation contains at most two entries (min one), will log in details of the Mess-detector results
|
||||
- Support for alternative language frequency set in charset_normalizer.assets.FREQUENCIES
|
||||
- Add parameter `language_threshold` in `from_bytes`, `from_path` and `from_fp` to adjust the minimum expected coherence ratio
|
||||
|
||||
### Changed
|
||||
- Build with static metadata using 'build' frontend
|
||||
- Make the language detection stricter
|
||||
|
||||
### Fixed
|
||||
- CLI with opt --normalize fail when using full path for files
|
||||
- TooManyAccentuatedPlugin induce false positive on the mess detection when too few alpha character have been fed to it
|
||||
|
||||
### Removed
|
||||
- Coherence detector no longer return 'Simple English' instead return 'English'
|
||||
- Coherence detector no longer return 'Classical Chinese' instead return 'Chinese'
|
||||
|
||||
## [3.0.0b2](https://github.com/Ousret/charset_normalizer/compare/3.0.0b1...3.0.0b2) (2022-08-21)
|
||||
|
||||
### Added
|
||||
- `normalizer --version` now specify if current version provide extra speedup (meaning mypyc compilation whl)
|
||||
|
||||
### Removed
|
||||
- Breaking: Method `first()` and `best()` from CharsetMatch
|
||||
- UTF-7 will no longer appear as "detected" without a recognized SIG/mark (is unreliable/conflict with ASCII)
|
||||
|
||||
### Fixed
|
||||
- Sphinx warnings when generating the documentation
|
||||
|
||||
## [3.0.0b1](https://github.com/Ousret/charset_normalizer/compare/2.1.0...3.0.0b1) (2022-08-15)
|
||||
|
||||
### Changed
|
||||
- Optional: Module `md.py` can be compiled using Mypyc to provide an extra speedup up to 4x faster than v2.1
|
||||
|
||||
### Removed
|
||||
- Breaking: Class aliases CharsetDetector, CharsetDoctor, CharsetNormalizerMatch and CharsetNormalizerMatches
|
||||
- Breaking: Top-level function `normalize`
|
||||
- Breaking: Properties `chaos_secondary_pass`, `coherence_non_latin` and `w_counter` from CharsetMatch
|
||||
- Support for the backport `unicodedata2`
|
||||
|
||||
## [2.1.1](https://github.com/Ousret/charset_normalizer/compare/2.1.0...2.1.1) (2022-08-19)
|
||||
|
||||
### Deprecated
|
||||
- Function `normalize` scheduled for removal in 3.0
|
||||
|
||||
### Changed
|
||||
- Removed useless call to decode in fn is_unprintable (#206)
|
||||
|
||||
### Fixed
|
||||
- Third-party library (i18n xgettext) crashing not recognizing utf_8 (PEP 263) with underscore from [@aleksandernovikov](https://github.com/aleksandernovikov) (#204)
|
||||
|
||||
## [2.1.0](https://github.com/Ousret/charset_normalizer/compare/2.0.12...2.1.0) (2022-06-19)
|
||||
|
||||
### Added
|
||||
- Output the Unicode table version when running the CLI with `--version` (PR #194)
|
||||
|
||||
### Changed
|
||||
- Re-use decoded buffer for single byte character sets from [@nijel](https://github.com/nijel) (PR #175)
|
||||
- Fixing some performance bottlenecks from [@deedy5](https://github.com/deedy5) (PR #183)
|
||||
|
||||
### Fixed
|
||||
- Workaround potential bug in cpython with Zero Width No-Break Space located in Arabic Presentation Forms-B, Unicode 1.1 not acknowledged as space (PR #175)
|
||||
- CLI default threshold aligned with the API threshold from [@oleksandr-kuzmenko](https://github.com/oleksandr-kuzmenko) (PR #181)
|
||||
|
||||
### Removed
|
||||
- Support for Python 3.5 (PR #192)
|
||||
|
||||
### Deprecated
|
||||
- Use of backport unicodedata from `unicodedata2` as Python is quickly catching up, scheduled for removal in 3.0 (PR #194)
|
||||
|
||||
## [2.0.12](https://github.com/Ousret/charset_normalizer/compare/2.0.11...2.0.12) (2022-02-12)
|
||||
|
||||
### Fixed
|
||||
- ASCII miss-detection on rare cases (PR #170)
|
||||
|
||||
## [2.0.11](https://github.com/Ousret/charset_normalizer/compare/2.0.10...2.0.11) (2022-01-30)
|
||||
|
||||
### Added
|
||||
- Explicit support for Python 3.11 (PR #164)
|
||||
|
||||
### Changed
|
||||
- The logging behavior have been completely reviewed, now using only TRACE and DEBUG levels (PR #163 #165)
|
||||
|
||||
## [2.0.10](https://github.com/Ousret/charset_normalizer/compare/2.0.9...2.0.10) (2022-01-04)
|
||||
|
||||
### Fixed
|
||||
- Fallback match entries might lead to UnicodeDecodeError for large bytes sequence (PR #154)
|
||||
|
||||
### Changed
|
||||
- Skipping the language-detection (CD) on ASCII (PR #155)
|
||||
|
||||
## [2.0.9](https://github.com/Ousret/charset_normalizer/compare/2.0.8...2.0.9) (2021-12-03)
|
||||
|
||||
### Changed
|
||||
- Moderating the logging impact (since 2.0.8) for specific environments (PR #147)
|
||||
|
||||
### Fixed
|
||||
- Wrong logging level applied when setting kwarg `explain` to True (PR #146)
|
||||
|
||||
## [2.0.8](https://github.com/Ousret/charset_normalizer/compare/2.0.7...2.0.8) (2021-11-24)
|
||||
### Changed
|
||||
- Improvement over Vietnamese detection (PR #126)
|
||||
- MD improvement on trailing data and long foreign (non-pure latin) data (PR #124)
|
||||
- Efficiency improvements in cd/alphabet_languages from [@adbar](https://github.com/adbar) (PR #122)
|
||||
- call sum() without an intermediary list following PEP 289 recommendations from [@adbar](https://github.com/adbar) (PR #129)
|
||||
- Code style as refactored by Sourcery-AI (PR #131)
|
||||
- Minor adjustment on the MD around european words (PR #133)
|
||||
- Remove and replace SRTs from assets / tests (PR #139)
|
||||
- Initialize the library logger with a `NullHandler` by default from [@nmaynes](https://github.com/nmaynes) (PR #135)
|
||||
- Setting kwarg `explain` to True will add provisionally (bounded to function lifespan) a specific stream handler (PR #135)
|
||||
|
||||
### Fixed
|
||||
- Fix large (misleading) sequence giving UnicodeDecodeError (PR #137)
|
||||
- Avoid using too insignificant chunk (PR #137)
|
||||
|
||||
### Added
|
||||
- Add and expose function `set_logging_handler` to configure a specific StreamHandler from [@nmaynes](https://github.com/nmaynes) (PR #135)
|
||||
- Add `CHANGELOG.md` entries, format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/) (PR #141)
|
||||
|
||||
## [2.0.7](https://github.com/Ousret/charset_normalizer/compare/2.0.6...2.0.7) (2021-10-11)
|
||||
### Added
|
||||
- Add support for Kazakh (Cyrillic) language detection (PR #109)
|
||||
|
||||
### Changed
|
||||
- Further, improve inferring the language from a given single-byte code page (PR #112)
|
||||
- Vainly trying to leverage PEP263 when PEP3120 is not supported (PR #116)
|
||||
- Refactoring for potential performance improvements in loops from [@adbar](https://github.com/adbar) (PR #113)
|
||||
- Various detection improvement (MD+CD) (PR #117)
|
||||
|
||||
### Removed
|
||||
- Remove redundant logging entry about detected language(s) (PR #115)
|
||||
|
||||
### Fixed
|
||||
- Fix a minor inconsistency between Python 3.5 and other versions regarding language detection (PR #117 #102)
|
||||
|
||||
## [2.0.6](https://github.com/Ousret/charset_normalizer/compare/2.0.5...2.0.6) (2021-09-18)
|
||||
### Fixed
|
||||
- Unforeseen regression with the loss of the backward-compatibility with some older minor of Python 3.5.x (PR #100)
|
||||
- Fix CLI crash when using --minimal output in certain cases (PR #103)
|
||||
|
||||
### Changed
|
||||
- Minor improvement to the detection efficiency (less than 1%) (PR #106 #101)
|
||||
|
||||
## [2.0.5](https://github.com/Ousret/charset_normalizer/compare/2.0.4...2.0.5) (2021-09-14)
|
||||
### Changed
|
||||
- The project now comply with: flake8, mypy, isort and black to ensure a better overall quality (PR #81)
|
||||
- The BC-support with v1.x was improved, the old staticmethods are restored (PR #82)
|
||||
- The Unicode detection is slightly improved (PR #93)
|
||||
- Add syntax sugar \_\_bool\_\_ for results CharsetMatches list-container (PR #91)
|
||||
|
||||
### Removed
|
||||
- The project no longer raise warning on tiny content given for detection, will be simply logged as warning instead (PR #92)
|
||||
|
||||
### Fixed
|
||||
- In some rare case, the chunks extractor could cut in the middle of a multi-byte character and could mislead the mess detection (PR #95)
|
||||
- Some rare 'space' characters could trip up the UnprintablePlugin/Mess detection (PR #96)
|
||||
- The MANIFEST.in was not exhaustive (PR #78)
|
||||
|
||||
## [2.0.4](https://github.com/Ousret/charset_normalizer/compare/2.0.3...2.0.4) (2021-07-30)
|
||||
### Fixed
|
||||
- The CLI no longer raise an unexpected exception when no encoding has been found (PR #70)
|
||||
- Fix accessing the 'alphabets' property when the payload contains surrogate characters (PR #68)
|
||||
- The logger could mislead (explain=True) on detected languages and the impact of one MBCS match (PR #72)
|
||||
- Submatch factoring could be wrong in rare edge cases (PR #72)
|
||||
- Multiple files given to the CLI were ignored when publishing results to STDOUT. (After the first path) (PR #72)
|
||||
- Fix line endings from CRLF to LF for certain project files (PR #67)
|
||||
|
||||
### Changed
|
||||
- Adjust the MD to lower the sensitivity, thus improving the global detection reliability (PR #69 #76)
|
||||
- Allow fallback on specified encoding if any (PR #71)
|
||||
|
||||
## [2.0.3](https://github.com/Ousret/charset_normalizer/compare/2.0.2...2.0.3) (2021-07-16)
|
||||
### Changed
|
||||
- Part of the detection mechanism has been improved to be less sensitive, resulting in more accurate detection results. Especially ASCII. (PR #63)
|
||||
- According to the community wishes, the detection will fall back on ASCII or UTF-8 in a last-resort case. (PR #64)
|
||||
|
||||
## [2.0.2](https://github.com/Ousret/charset_normalizer/compare/2.0.1...2.0.2) (2021-07-15)
|
||||
### Fixed
|
||||
- Empty/Too small JSON payload miss-detection fixed. Report from [@tseaver](https://github.com/tseaver) (PR #59)
|
||||
|
||||
### Changed
|
||||
- Don't inject unicodedata2 into sys.modules from [@akx](https://github.com/akx) (PR #57)
|
||||
|
||||
## [2.0.1](https://github.com/Ousret/charset_normalizer/compare/2.0.0...2.0.1) (2021-07-13)
|
||||
### Fixed
|
||||
- Make it work where there isn't a filesystem available, dropping assets frequencies.json. Report from [@sethmlarson](https://github.com/sethmlarson). (PR #55)
|
||||
- Using explain=False permanently disable the verbose output in the current runtime (PR #47)
|
||||
- One log entry (language target preemptive) was not show in logs when using explain=True (PR #47)
|
||||
- Fix undesired exception (ValueError) on getitem of instance CharsetMatches (PR #52)
|
||||
|
||||
### Changed
|
||||
- Public function normalize default args values were not aligned with from_bytes (PR #53)
|
||||
|
||||
### Added
|
||||
- You may now use charset aliases in cp_isolation and cp_exclusion arguments (PR #47)
|
||||
|
||||
## [2.0.0](https://github.com/Ousret/charset_normalizer/compare/1.4.1...2.0.0) (2021-07-02)
|
||||
### Changed
|
||||
- 4x to 5 times faster than the previous 1.4.0 release. At least 2x faster than Chardet.
|
||||
- Accent has been made on UTF-8 detection, should perform rather instantaneous.
|
||||
- The backward compatibility with Chardet has been greatly improved. The legacy detect function returns an identical charset name whenever possible.
|
||||
- The detection mechanism has been slightly improved, now Turkish content is detected correctly (most of the time)
|
||||
- The program has been rewritten to ease the readability and maintainability. (+Using static typing)+
|
||||
- utf_7 detection has been reinstated.
|
||||
|
||||
### Removed
|
||||
- This package no longer require anything when used with Python 3.5 (Dropped cached_property)
|
||||
- Removed support for these languages: Catalan, Esperanto, Kazakh, Basque, Volapük, Azeri, Galician, Nynorsk, Macedonian, and Serbocroatian.
|
||||
- The exception hook on UnicodeDecodeError has been removed.
|
||||
|
||||
### Deprecated
|
||||
- Methods coherence_non_latin, w_counter, chaos_secondary_pass of the class CharsetMatch are now deprecated and scheduled for removal in v3.0
|
||||
|
||||
### Fixed
|
||||
- The CLI output used the relative path of the file(s). Should be absolute.
|
||||
|
||||
## [1.4.1](https://github.com/Ousret/charset_normalizer/compare/1.4.0...1.4.1) (2021-05-28)
|
||||
### Fixed
|
||||
- Logger configuration/usage no longer conflict with others (PR #44)
|
||||
|
||||
## [1.4.0](https://github.com/Ousret/charset_normalizer/compare/1.3.9...1.4.0) (2021-05-21)
|
||||
### Removed
|
||||
- Using standard logging instead of using the package loguru.
|
||||
- Dropping nose test framework in favor of the maintained pytest.
|
||||
- Choose to not use dragonmapper package to help with gibberish Chinese/CJK text.
|
||||
- Require cached_property only for Python 3.5 due to constraint. Dropping for every other interpreter version.
|
||||
- Stop support for UTF-7 that does not contain a SIG.
|
||||
- Dropping PrettyTable, replaced with pure JSON output in CLI.
|
||||
|
||||
### Fixed
|
||||
- BOM marker in a CharsetNormalizerMatch instance could be False in rare cases even if obviously present. Due to the sub-match factoring process.
|
||||
- Not searching properly for the BOM when trying utf32/16 parent codec.
|
||||
|
||||
### Changed
|
||||
- Improving the package final size by compressing frequencies.json.
|
||||
- Huge improvement over the larges payload.
|
||||
|
||||
### Added
|
||||
- CLI now produces JSON consumable output.
|
||||
- Return ASCII if given sequences fit. Given reasonable confidence.
|
||||
|
||||
## [1.3.9](https://github.com/Ousret/charset_normalizer/compare/1.3.8...1.3.9) (2021-05-13)
|
||||
|
||||
### Fixed
|
||||
- In some very rare cases, you may end up getting encode/decode errors due to a bad bytes payload (PR #40)
|
||||
|
||||
## [1.3.8](https://github.com/Ousret/charset_normalizer/compare/1.3.7...1.3.8) (2021-05-12)
|
||||
|
||||
### Fixed
|
||||
- Empty given payload for detection may cause an exception if trying to access the `alphabets` property. (PR #39)
|
||||
|
||||
## [1.3.7](https://github.com/Ousret/charset_normalizer/compare/1.3.6...1.3.7) (2021-05-12)
|
||||
|
||||
### Fixed
|
||||
- The legacy detect function should return UTF-8-SIG if sig is present in the payload. (PR #38)
|
||||
|
||||
## [1.3.6](https://github.com/Ousret/charset_normalizer/compare/1.3.5...1.3.6) (2021-02-09)
|
||||
|
||||
### Changed
|
||||
- Amend the previous release to allow prettytable 2.0 (PR #35)
|
||||
|
||||
## [1.3.5](https://github.com/Ousret/charset_normalizer/compare/1.3.4...1.3.5) (2021-02-08)
|
||||
|
||||
### Fixed
|
||||
- Fix error while using the package with a python pre-release interpreter (PR #33)
|
||||
|
||||
### Changed
|
||||
- Dependencies refactoring, constraints revised.
|
||||
|
||||
### Added
|
||||
- Add python 3.9 and 3.10 to the supported interpreters
|
||||
|
||||
MIT License
|
||||
|
||||
Copyright (c) 2019 TAHRI Ahmed R.
|
||||
|
||||
Permission is hereby granted, free of charge, to any person obtaining a copy
|
||||
of this software and associated documentation files (the "Software"), to deal
|
||||
in the Software without restriction, including without limitation the rights
|
||||
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
copies of the Software, and to permit persons to whom the Software is
|
||||
furnished to do so, subject to the following conditions:
|
||||
|
||||
The above copyright notice and this permission notice shall be included in all
|
||||
copies or substantial portions of the Software.
|
||||
|
||||
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
||||
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
||||
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
||||
SOFTWARE.
|
||||
+35
@@ -0,0 +1,35 @@
|
||||
../../bin/normalizer,sha256=O1tLXvRzeuQHDVSDjsuiUko8eeXdZtA_eGTgJcdT5qs,233
|
||||
charset_normalizer-3.3.2.dist-info/INSTALLER,sha256=zuuue4knoyJ-UwPPXg8fezS7VCrXJQrAP7zeNuwvFQg,4
|
||||
charset_normalizer-3.3.2.dist-info/LICENSE,sha256=6zGgxaT7Cbik4yBV0lweX5w1iidS_vPNcgIT0cz-4kE,1070
|
||||
charset_normalizer-3.3.2.dist-info/METADATA,sha256=cfLhl5A6SI-F0oclm8w8ux9wshL1nipdeCdVnYb4AaA,33550
|
||||
charset_normalizer-3.3.2.dist-info/RECORD,,
|
||||
charset_normalizer-3.3.2.dist-info/WHEEL,sha256=4ZiCdXIWMxJyEClivrQv1QAHZpQh8kVYU92_ZAVwaok,152
|
||||
charset_normalizer-3.3.2.dist-info/entry_points.txt,sha256=ADSTKrkXZ3hhdOVFi6DcUEHQRS0xfxDIE_pEz4wLIXA,65
|
||||
charset_normalizer-3.3.2.dist-info/top_level.txt,sha256=7ASyzePr8_xuZWJsnqJjIBtyV8vhEo0wBCv1MPRRi3Q,19
|
||||
charset_normalizer/__init__.py,sha256=UzI3xC8PhmcLRMzSgPb6minTmRq0kWznnCBJ8ZCc2XI,1577
|
||||
charset_normalizer/__main__.py,sha256=JxY8bleaENOFlLRb9HfoeZCzAMnn2A1oGR5Xm2eyqg0,73
|
||||
charset_normalizer/__pycache__/__init__.cpython-312.pyc,,
|
||||
charset_normalizer/__pycache__/__main__.cpython-312.pyc,,
|
||||
charset_normalizer/__pycache__/api.cpython-312.pyc,,
|
||||
charset_normalizer/__pycache__/cd.cpython-312.pyc,,
|
||||
charset_normalizer/__pycache__/constant.cpython-312.pyc,,
|
||||
charset_normalizer/__pycache__/legacy.cpython-312.pyc,,
|
||||
charset_normalizer/__pycache__/md.cpython-312.pyc,,
|
||||
charset_normalizer/__pycache__/models.cpython-312.pyc,,
|
||||
charset_normalizer/__pycache__/utils.cpython-312.pyc,,
|
||||
charset_normalizer/__pycache__/version.cpython-312.pyc,,
|
||||
charset_normalizer/api.py,sha256=WOlWjy6wT8SeMYFpaGbXZFN1TMXa-s8vZYfkL4G29iQ,21097
|
||||
charset_normalizer/cd.py,sha256=xwZliZcTQFA3jU0c00PRiu9MNxXTFxQkFLWmMW24ZzI,12560
|
||||
charset_normalizer/cli/__init__.py,sha256=D5ERp8P62llm2FuoMzydZ7d9rs8cvvLXqE-1_6oViPc,100
|
||||
charset_normalizer/cli/__main__.py,sha256=2F-xURZJzo063Ye-2RLJ2wcmURpbKeAzKwpiws65dAs,9744
|
||||
charset_normalizer/cli/__pycache__/__init__.cpython-312.pyc,,
|
||||
charset_normalizer/cli/__pycache__/__main__.cpython-312.pyc,,
|
||||
charset_normalizer/constant.py,sha256=p0IsOVcEbPWYPOdWhnhRbjK1YVBy6fs05C5vKC-zoxU,40481
|
||||
charset_normalizer/legacy.py,sha256=T-QuVMsMeDiQEk8WSszMrzVJg_14AMeSkmHdRYhdl1k,2071
|
||||
charset_normalizer/md.cpython-312-x86_64-linux-gnu.so,sha256=W654QTU3QZI6eWJ0fanScAr0_O6sL0I61fyRSdC-39Y,16064
|
||||
charset_normalizer/md.py,sha256=NkSuVLK13_a8c7BxZ4cGIQ5vOtGIWOdh22WZEvjp-7U,19624
|
||||
charset_normalizer/md__mypyc.cpython-312-x86_64-linux-gnu.so,sha256=IlObIV4dmRhFV8V7H-zK4rTxPzTSi9JmrWZD26JQfxI,272640
|
||||
charset_normalizer/models.py,sha256=I5i0s4aKCCgLPY2tUY3pwkgFA-BUbbNxQ7hVkVTt62s,11624
|
||||
charset_normalizer/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
||||
charset_normalizer/utils.py,sha256=teiosMqzKjXyAHXnGdjSBOgnBZwx-SkBbCLrx0UXy8M,11894
|
||||
charset_normalizer/version.py,sha256=iHKUfHD3kDRSyrh_BN2ojh43TA5-UZQjvbVIEFfpHDs,79
|
||||
+6
@@ -0,0 +1,6 @@
|
||||
Wheel-Version: 1.0
|
||||
Generator: bdist_wheel (0.41.2)
|
||||
Root-Is-Purelib: false
|
||||
Tag: cp312-cp312-manylinux_2_17_x86_64
|
||||
Tag: cp312-cp312-manylinux2014_x86_64
|
||||
|
||||
+2
@@ -0,0 +1,2 @@
|
||||
[console_scripts]
|
||||
normalizer = charset_normalizer.cli:cli_detect
|
||||
+1
@@ -0,0 +1 @@
|
||||
charset_normalizer
|
||||
+46
@@ -0,0 +1,46 @@
|
||||
# -*- coding: utf-8 -*-
|
||||
"""
|
||||
Charset-Normalizer
|
||||
~~~~~~~~~~~~~~
|
||||
The Real First Universal Charset Detector.
|
||||
A library that helps you read text from an unknown charset encoding.
|
||||
Motivated by chardet, this package tries to resolve the issue by taking a new approach.
|
||||
All IANA character set names for which the Python core library provides codecs are supported.
|
||||
|
||||
Basic usage:
|
||||
>>> from charset_normalizer import from_bytes
|
||||
>>> results = from_bytes('Bсеки човек има право на образование. Oбразованието!'.encode('utf_8'))
|
||||
>>> best_guess = results.best()
|
||||
>>> str(best_guess)
|
||||
'Bсеки човек има право на образование. Oбразованието!'
|
||||
|
||||
Other methods and usages are available - see the full documentation
|
||||
at <https://github.com/Ousret/charset_normalizer>.
|
||||
:copyright: (c) 2021 by Ahmed TAHRI
|
||||
:license: MIT, see LICENSE for more details.
|
||||
"""
|
||||
import logging
|
||||
|
||||
from .api import from_bytes, from_fp, from_path, is_binary
|
||||
from .legacy import detect
|
||||
from .models import CharsetMatch, CharsetMatches
|
||||
from .utils import set_logging_handler
|
||||
from .version import VERSION, __version__
|
||||
|
||||
__all__ = (
|
||||
"from_fp",
|
||||
"from_path",
|
||||
"from_bytes",
|
||||
"is_binary",
|
||||
"detect",
|
||||
"CharsetMatch",
|
||||
"CharsetMatches",
|
||||
"__version__",
|
||||
"VERSION",
|
||||
"set_logging_handler",
|
||||
)
|
||||
|
||||
# Attach a NullHandler to the top level logger by default
|
||||
# https://docs.python.org/3.3/howto/logging.html#configuring-logging-for-a-library
|
||||
|
||||
logging.getLogger("charset_normalizer").addHandler(logging.NullHandler())
|
||||
+4
@@ -0,0 +1,4 @@
|
||||
from .cli import cli_detect
|
||||
|
||||
if __name__ == "__main__":
|
||||
cli_detect()
|
||||
+626
@@ -0,0 +1,626 @@
|
||||
import logging
|
||||
from os import PathLike
|
||||
from typing import BinaryIO, List, Optional, Set, Union
|
||||
|
||||
from .cd import (
|
||||
coherence_ratio,
|
||||
encoding_languages,
|
||||
mb_encoding_languages,
|
||||
merge_coherence_ratios,
|
||||
)
|
||||
from .constant import IANA_SUPPORTED, TOO_BIG_SEQUENCE, TOO_SMALL_SEQUENCE, TRACE
|
||||
from .md import mess_ratio
|
||||
from .models import CharsetMatch, CharsetMatches
|
||||
from .utils import (
|
||||
any_specified_encoding,
|
||||
cut_sequence_chunks,
|
||||
iana_name,
|
||||
identify_sig_or_bom,
|
||||
is_cp_similar,
|
||||
is_multi_byte_encoding,
|
||||
should_strip_sig_or_bom,
|
||||
)
|
||||
|
||||
# Registering a display name for the custom TRACE level is left disabled:
# doing so will most likely be controversial.
# logging.addLevelName(TRACE, "TRACE")

# Package-wide logger used by every detection function in this module.
logger = logging.getLogger("charset_normalizer")

# Stream handler that from_bytes() attaches to `logger` when explain=True and
# detaches again before returning.
explain_handler = logging.StreamHandler()
explain_handler.setFormatter(
    logging.Formatter("%(asctime)s | %(levelname)s | %(message)s")
)
|
||||
|
||||
|
||||
def from_bytes(
    sequences: Union[bytes, bytearray],
    steps: int = 5,
    chunk_size: int = 512,
    threshold: float = 0.2,
    cp_isolation: Optional[List[str]] = None,
    cp_exclusion: Optional[List[str]] = None,
    preemptive_behaviour: bool = True,
    explain: bool = False,
    language_threshold: float = 0.1,
    enable_fallback: bool = True,
) -> CharsetMatches:
    """
    Given a raw bytes sequence, return the best possible charsets usable to render str objects.
    If there are no results, it is a strong indicator that the source is binary/not text.
    By default, the process will extract 5 blocks of 512 bytes each to assess the mess and coherence
    of a given sequence, and will give up a particular code page after 20% of measured mess.
    Those criteria are customizable at will.

    The preemptive behavior DOES NOT replace the traditional detection workflow, it prioritizes a
    particular code page but never takes it for granted. It can improve the performance.

    You may want to focus your attention on some code pages and/or not others; use cp_isolation and
    cp_exclusion for that purpose.

    This function will strip the SIG in the payload/sequence every time except on UTF-16, UTF-32.
    By default the library does not set up any handler other than the NullHandler. If you set the
    'explain' toggle to True, a StreamHandler suitable for debugging is attached to the package
    logger and its level is lowered to TRACE for the duration of the call; both are restored
    before returning, even when the detection raises.
    Custom logging format and handler can be set manually.

    :raises TypeError: when `sequences` is neither bytes nor bytearray.
    """
    if not isinstance(sequences, (bytearray, bytes)):
        raise TypeError(
            "Expected object of type bytes or bytearray, got: {0}".format(
                type(sequences)
            )
        )

    if not explain:
        return _detect_from_bytes(
            sequences,
            steps=steps,
            chunk_size=chunk_size,
            threshold=threshold,
            cp_isolation=cp_isolation,
            cp_exclusion=cp_exclusion,
            preemptive_behaviour=preemptive_behaviour,
            explain=explain,
            language_threshold=language_threshold,
            enable_fallback=enable_fallback,
        )

    previous_logger_level: int = logger.level
    logger.addHandler(explain_handler)
    logger.setLevel(TRACE)

    try:
        return _detect_from_bytes(
            sequences,
            steps=steps,
            chunk_size=chunk_size,
            threshold=threshold,
            cp_isolation=cp_isolation,
            cp_exclusion=cp_exclusion,
            preemptive_behaviour=preemptive_behaviour,
            explain=explain,
            language_threshold=language_threshold,
            enable_fallback=enable_fallback,
        )
    finally:
        # Single cleanup point: the logger is put back exactly as it was found,
        # whichever return path (or exception) ends the detection.
        logger.removeHandler(explain_handler)
        logger.setLevel(previous_logger_level)


def _detect_from_bytes(
    sequences: Union[bytes, bytearray],
    steps: int,
    chunk_size: int,
    threshold: float,
    cp_isolation: Optional[List[str]],
    cp_exclusion: Optional[List[str]],
    preemptive_behaviour: bool,
    explain: bool,
    language_threshold: float,
    enable_fallback: bool,
) -> CharsetMatches:
    """Run the detection pipeline of from_bytes; logger setup/teardown is the caller's job."""
    length: int = len(sequences)

    if length == 0:
        logger.debug("Encoding detection on empty bytes, assuming utf_8 intention.")
        return CharsetMatches([CharsetMatch(sequences, "utf_8", 0.0, False, [], "")])

    if cp_isolation is not None:
        logger.log(
            TRACE,
            "cp_isolation is set. use this flag for debugging purpose. "
            "limited list of encoding allowed : %s.",
            ", ".join(cp_isolation),
        )
        cp_isolation = [iana_name(cp, False) for cp in cp_isolation]
    else:
        cp_isolation = []

    if cp_exclusion is not None:
        logger.log(
            TRACE,
            "cp_exclusion is set. use this flag for debugging purpose. "
            "limited list of encoding excluded : %s.",
            ", ".join(cp_exclusion),
        )
        cp_exclusion = [iana_name(cp, False) for cp in cp_exclusion]
    else:
        cp_exclusion = []

    # Payload smaller than the requested sampling window: probe it as one chunk.
    if length <= (chunk_size * steps):
        logger.log(
            TRACE,
            "override steps (%i) and chunk_size (%i) as content does not fit (%i byte(s) given) parameters.",
            steps,
            chunk_size,
            length,
        )
        steps = 1
        chunk_size = length

    if steps > 1 and length / steps < chunk_size:
        chunk_size = int(length / steps)

    is_too_small_sequence: bool = len(sequences) < TOO_SMALL_SEQUENCE
    is_too_large_sequence: bool = len(sequences) >= TOO_BIG_SEQUENCE

    if is_too_small_sequence:
        logger.log(
            TRACE,
            "Trying to detect encoding from a tiny portion of ({}) byte(s).".format(
                length
            ),
        )
    elif is_too_large_sequence:
        logger.log(
            TRACE,
            "Using lazy str decoding because the payload is quite large, ({}) byte(s).".format(
                length
            ),
        )

    prioritized_encodings: List[str] = []

    specified_encoding: Optional[str] = (
        any_specified_encoding(sequences) if preemptive_behaviour else None
    )

    if specified_encoding is not None:
        prioritized_encodings.append(specified_encoding)
        logger.log(
            TRACE,
            "Detected declarative mark in sequence. Priority +1 given for %s.",
            specified_encoding,
        )

    tested: Set[str] = set()
    tested_but_hard_failure: List[str] = []
    tested_but_soft_failure: List[str] = []

    fallback_ascii: Optional[CharsetMatch] = None
    fallback_u8: Optional[CharsetMatch] = None
    fallback_specified: Optional[CharsetMatch] = None

    results: CharsetMatches = CharsetMatches()

    sig_encoding, sig_payload = identify_sig_or_bom(sequences)

    if sig_encoding is not None:
        prioritized_encodings.append(sig_encoding)
        logger.log(
            TRACE,
            "Detected a SIG or BOM mark on first %i byte(s). Priority +1 given for %s.",
            len(sig_payload),
            sig_encoding,
        )

    prioritized_encodings.append("ascii")

    if "utf_8" not in prioritized_encodings:
        prioritized_encodings.append("utf_8")

    for encoding_iana in prioritized_encodings + IANA_SUPPORTED:
        if cp_isolation and encoding_iana not in cp_isolation:
            continue

        if cp_exclusion and encoding_iana in cp_exclusion:
            continue

        # Prioritized encodings also appear in IANA_SUPPORTED; test each once.
        if encoding_iana in tested:
            continue

        tested.add(encoding_iana)

        decoded_payload: Optional[str] = None
        bom_or_sig_available: bool = sig_encoding == encoding_iana
        strip_sig_or_bom: bool = bom_or_sig_available and should_strip_sig_or_bom(
            encoding_iana
        )

        if encoding_iana in {"utf_16", "utf_32"} and not bom_or_sig_available:
            logger.log(
                TRACE,
                "Encoding %s won't be tested as-is because it require a BOM. Will try some sub-encoder LE/BE.",
                encoding_iana,
            )
            continue
        if encoding_iana in {"utf_7"} and not bom_or_sig_available:
            logger.log(
                TRACE,
                "Encoding %s won't be tested as-is because detection is unreliable without BOM/SIG.",
                encoding_iana,
            )
            continue

        try:
            is_multi_byte_decoder: bool = is_multi_byte_encoding(encoding_iana)
        except (ModuleNotFoundError, ImportError):
            logger.log(
                TRACE,
                "Encoding %s does not provide an IncrementalDecoder",
                encoding_iana,
            )
            continue

        try:
            if is_too_large_sequence and is_multi_byte_decoder is False:
                # Lazy path: only validate the first 500,000 bytes and do not
                # keep the decoded text around.
                str(
                    sequences[: int(50e4)]
                    if strip_sig_or_bom is False
                    else sequences[len(sig_payload) : int(50e4)],
                    encoding=encoding_iana,
                )
            else:
                decoded_payload = str(
                    sequences
                    if strip_sig_or_bom is False
                    else sequences[len(sig_payload) :],
                    encoding=encoding_iana,
                )
        except (UnicodeDecodeError, LookupError) as e:
            if not isinstance(e, LookupError):
                logger.log(
                    TRACE,
                    "Code page %s does not fit given bytes sequence at ALL. %s",
                    encoding_iana,
                    str(e),
                )
            tested_but_hard_failure.append(encoding_iana)
            continue

        similar_soft_failure_test: bool = False

        for encoding_soft_failed in tested_but_soft_failure:
            if is_cp_similar(encoding_iana, encoding_soft_failed):
                similar_soft_failure_test = True
                break

        if similar_soft_failure_test:
            logger.log(
                TRACE,
                "%s is deemed too similar to code page %s and was consider unsuited already. Continuing!",
                encoding_iana,
                encoding_soft_failed,
            )
            continue

        # Start offsets of the chunks handed to the mess detector.
        r_ = range(
            0 if not bom_or_sig_available else len(sig_payload),
            length,
            int(length / steps),
        )

        multi_byte_bonus: bool = (
            is_multi_byte_decoder
            and decoded_payload is not None
            and len(decoded_payload) < length
        )

        if multi_byte_bonus:
            logger.log(
                TRACE,
                "Code page %s is a multi byte encoding table and it appear that at least one character "
                "was encoded using n-bytes.",
                encoding_iana,
            )

        max_chunk_gave_up: int = int(len(r_) / 4)

        max_chunk_gave_up = max(max_chunk_gave_up, 2)
        early_stop_count: int = 0
        lazy_str_hard_failure = False

        md_chunks: List[str] = []
        md_ratios = []

        try:
            for chunk in cut_sequence_chunks(
                sequences,
                encoding_iana,
                r_,
                chunk_size,
                bom_or_sig_available,
                strip_sig_or_bom,
                sig_payload,
                is_multi_byte_decoder,
                decoded_payload,
            ):
                md_chunks.append(chunk)

                md_ratios.append(
                    mess_ratio(
                        chunk,
                        threshold,
                        explain is True and 1 <= len(cp_isolation) <= 2,
                    )
                )

                if md_ratios[-1] >= threshold:
                    early_stop_count += 1

                if (early_stop_count >= max_chunk_gave_up) or (
                    bom_or_sig_available and strip_sig_or_bom is False
                ):
                    break
        except (
            UnicodeDecodeError
        ) as e:  # Lazy str loading may have missed something there
            logger.log(
                TRACE,
                "LazyStr Loading: After MD chunk decode, code page %s does not fit given bytes sequence at ALL. %s",
                encoding_iana,
                str(e),
            )
            early_stop_count = max_chunk_gave_up
            lazy_str_hard_failure = True

        # We might want to check the sequence again with the whole content
        # Only if initial MD tests passes
        if (
            not lazy_str_hard_failure
            and is_too_large_sequence
            and not is_multi_byte_decoder
        ):
            try:
                # NOTE(review): this re-validates from byte 50,000 although the
                # lazy path above already decoded the first 500,000 bytes;
                # redundant but harmless, kept unchanged.
                sequences[int(50e3) :].decode(encoding_iana, errors="strict")
            except UnicodeDecodeError as e:
                logger.log(
                    TRACE,
                    "LazyStr Loading: After final lookup, code page %s does not fit given bytes sequence at ALL. %s",
                    encoding_iana,
                    str(e),
                )
                tested_but_hard_failure.append(encoding_iana)
                continue

        mean_mess_ratio: float = sum(md_ratios) / len(md_ratios) if md_ratios else 0.0
        if mean_mess_ratio >= threshold or early_stop_count >= max_chunk_gave_up:
            tested_but_soft_failure.append(encoding_iana)
            logger.log(
                TRACE,
                "%s was excluded because of initial chaos probing. Gave up %i time(s). "
                "Computed mean chaos is %f %%.",
                encoding_iana,
                early_stop_count,
                round(mean_mess_ratio * 100, ndigits=3),
            )
            # Preparing those fallbacks in case we got nothing.
            if (
                enable_fallback
                and encoding_iana in ["ascii", "utf_8", specified_encoding]
                and not lazy_str_hard_failure
            ):
                fallback_entry = CharsetMatch(
                    sequences, encoding_iana, threshold, False, [], decoded_payload
                )
                if encoding_iana == specified_encoding:
                    fallback_specified = fallback_entry
                elif encoding_iana == "ascii":
                    fallback_ascii = fallback_entry
                else:
                    fallback_u8 = fallback_entry
            continue

        logger.log(
            TRACE,
            "%s passed initial chaos probing. Mean measured chaos is %f %%",
            encoding_iana,
            round(mean_mess_ratio * 100, ndigits=3),
        )

        if not is_multi_byte_decoder:
            target_languages: List[str] = encoding_languages(encoding_iana)
        else:
            target_languages = mb_encoding_languages(encoding_iana)

        if target_languages:
            logger.log(
                TRACE,
                "{} should target any language(s) of {}".format(
                    encoding_iana, str(target_languages)
                ),
            )

        cd_ratios = []

        # We shall skip the CD when its about ASCII
        # Most of the time its not relevant to run "language-detection" on it.
        if encoding_iana != "ascii":
            for chunk in md_chunks:
                chunk_languages = coherence_ratio(
                    chunk,
                    language_threshold,
                    ",".join(target_languages) if target_languages else None,
                )

                cd_ratios.append(chunk_languages)

        cd_ratios_merged = merge_coherence_ratios(cd_ratios)

        if cd_ratios_merged:
            logger.log(
                TRACE,
                "We detected language {} using {}".format(
                    cd_ratios_merged, encoding_iana
                ),
            )

        results.append(
            CharsetMatch(
                sequences,
                encoding_iana,
                mean_mess_ratio,
                bom_or_sig_available,
                cd_ratios_merged,
                decoded_payload,
            )
        )

        # Short-circuit: a clean match on a prioritized encoding ends the search.
        if (
            encoding_iana in [specified_encoding, "ascii", "utf_8"]
            and mean_mess_ratio < 0.1
        ):
            logger.debug(
                "Encoding detection: %s is most likely the one.", encoding_iana
            )
            return CharsetMatches([results[encoding_iana]])

        if encoding_iana == sig_encoding:
            logger.debug(
                "Encoding detection: %s is most likely the one as we detected a BOM or SIG within "
                "the beginning of the sequence.",
                encoding_iana,
            )
            return CharsetMatches([results[encoding_iana]])

    if len(results) == 0:
        if fallback_u8 or fallback_ascii or fallback_specified:
            logger.log(
                TRACE,
                "Nothing got out of the detection process. Using ASCII/UTF-8/Specified fallback.",
            )

        if fallback_specified:
            logger.debug(
                "Encoding detection: %s will be used as a fallback match",
                fallback_specified.encoding,
            )
            results.append(fallback_specified)
        elif fallback_u8 is not None:
            # The former three-way condition (u8 without ascii / u8 and ascii
            # with different fingerprints / u8 not None) always reduced to this.
            logger.debug("Encoding detection: utf_8 will be used as a fallback match")
            results.append(fallback_u8)
        elif fallback_ascii:
            logger.debug("Encoding detection: ascii will be used as a fallback match")
            results.append(fallback_ascii)

    if results:
        logger.debug(
            "Encoding detection: Found %s as plausible (best-candidate) for content. With %i alternatives.",
            results.best().encoding,  # type: ignore
            len(results) - 1,
        )
    else:
        logger.debug("Encoding detection: Unable to determine any suitable charset.")

    return results
|
||||
|
||||
|
||||
def from_fp(
    fp: BinaryIO,
    steps: int = 5,
    chunk_size: int = 512,
    threshold: float = 0.20,
    cp_isolation: Optional[List[str]] = None,
    cp_exclusion: Optional[List[str]] = None,
    preemptive_behaviour: bool = True,
    explain: bool = False,
    language_threshold: float = 0.1,
    enable_fallback: bool = True,
) -> CharsetMatches:
    """
    Run from_bytes on everything that can be read from an already opened binary file object.
    The file object is read to its end and is left open; closing it is up to the caller.
    """
    payload = fp.read()

    return from_bytes(
        payload,
        steps=steps,
        chunk_size=chunk_size,
        threshold=threshold,
        cp_isolation=cp_isolation,
        cp_exclusion=cp_exclusion,
        preemptive_behaviour=preemptive_behaviour,
        explain=explain,
        language_threshold=language_threshold,
        enable_fallback=enable_fallback,
    )
|
||||
|
||||
|
||||
def from_path(
    path: Union[str, bytes, PathLike],  # type: ignore[type-arg]
    steps: int = 5,
    chunk_size: int = 512,
    threshold: float = 0.20,
    cp_isolation: Optional[List[str]] = None,
    cp_exclusion: Optional[List[str]] = None,
    preemptive_behaviour: bool = True,
    explain: bool = False,
    language_threshold: float = 0.1,
    enable_fallback: bool = True,
) -> CharsetMatches:
    """
    Open the file at `path` in binary mode and run the same detection as from_bytes on its content.
    Can raise IOError.
    """
    with open(path, "rb") as stream:
        matches = from_fp(
            stream,
            steps=steps,
            chunk_size=chunk_size,
            threshold=threshold,
            cp_isolation=cp_isolation,
            cp_exclusion=cp_exclusion,
            preemptive_behaviour=preemptive_behaviour,
            explain=explain,
            language_threshold=language_threshold,
            enable_fallback=enable_fallback,
        )

    return matches
|
||||
|
||||
|
||||
def is_binary(
    fp_or_path_or_payload: Union[PathLike, str, BinaryIO, bytes],  # type: ignore[type-arg]
    steps: int = 5,
    chunk_size: int = 512,
    threshold: float = 0.20,
    cp_isolation: Optional[List[str]] = None,
    cp_exclusion: Optional[List[str]] = None,
    preemptive_behaviour: bool = True,
    explain: bool = False,
    language_threshold: float = 0.1,
    enable_fallback: bool = False,
) -> bool:
    """
    Tell whether the given input (path, raw payload or binary file object) is binary, i.e. not text.
    Uses the same heuristics and defaults as the other detection functions, except that fallback
    matches are disabled by default so that ASCII-compatible content that is unlikely to be a
    string is judged more strictly.
    """
    # Pick the entry point matching the kind of input; the options are identical
    # for all three.
    if isinstance(fp_or_path_or_payload, (str, PathLike)):
        detector = from_path
    elif isinstance(fp_or_path_or_payload, (bytes, bytearray)):
        detector = from_bytes
    else:
        detector = from_fp

    guesses = detector(
        fp_or_path_or_payload,
        steps=steps,
        chunk_size=chunk_size,
        threshold=threshold,
        cp_isolation=cp_isolation,
        cp_exclusion=cp_exclusion,
        preemptive_behaviour=preemptive_behaviour,
        explain=explain,
        language_threshold=language_threshold,
        enable_fallback=enable_fallback,
    )

    # No plausible charset at all means the content is treated as binary.
    return not guesses
|
||||
+395
@@ -0,0 +1,395 @@
|
||||
import importlib
|
||||
from codecs import IncrementalDecoder
|
||||
from collections import Counter
|
||||
from functools import lru_cache
|
||||
from typing import Counter as TypeCounter, Dict, List, Optional, Tuple
|
||||
|
||||
from .constant import (
|
||||
FREQUENCIES,
|
||||
KO_NAMES,
|
||||
LANGUAGE_SUPPORTED_COUNT,
|
||||
TOO_SMALL_SEQUENCE,
|
||||
ZH_NAMES,
|
||||
)
|
||||
from .md import is_suspiciously_successive_range
|
||||
from .models import CoherenceMatches
|
||||
from .utils import (
|
||||
is_accentuated,
|
||||
is_latin,
|
||||
is_multi_byte_encoding,
|
||||
is_unicode_range_secondary,
|
||||
unicode_range,
|
||||
)
|
||||
|
||||
|
||||
def encoding_unicode_range(iana_name: str) -> List[str]:
    """
    List the main unicode ranges produced by a single-byte code page.

    Bytes 0x40 to 0xFE are decoded one by one; a range is kept when it accounts
    for at least 15% of the counted characters. Raises IOError when given a
    multi-byte code page.
    """
    if is_multi_byte_encoding(iana_name):
        raise IOError("Function not supported on multi-byte code page")

    codec_module = importlib.import_module("encodings.{}".format(iana_name))
    incremental: IncrementalDecoder = codec_module.IncrementalDecoder(errors="ignore")

    occurrences: Dict[str, int] = {}
    counted: int = 0

    for byte_value in range(0x40, 0xFF):
        decoded: str = incremental.decode(bytes([byte_value]))

        if not decoded:
            continue

        range_name: Optional[str] = unicode_range(decoded)

        # Unknown and "secondary" ranges are left out of the statistics.
        if range_name is None or is_unicode_range_secondary(range_name) is not False:
            continue

        occurrences[range_name] = occurrences.get(range_name, 0) + 1
        counted += 1

    return sorted(
        range_name
        for range_name, hits in occurrences.items()
        if hits / counted >= 0.15
    )
|
||||
|
||||
|
||||
def unicode_range_languages(primary_range: str) -> List[str]:
    """
    List the languages having at least one frequent character in the given unicode range.
    """
    return [
        language
        for language, characters in FREQUENCIES.items()
        if any(unicode_range(character) == primary_range for character in characters)
    ]
|
||||
|
||||
|
||||
@lru_cache()
def encoding_languages(iana_name: str) -> List[str]:
    """
    Associate a single-byte code page with the language(s) it is most likely used for.

    The first non-Latin unicode range of the code page decides; when every range
    is Latin the generic ["Latin Based"] answer is returned.
    """
    non_latin_ranges = (
        candidate
        for candidate in encoding_unicode_range(iana_name)
        if "Latin" not in candidate
    )
    primary_range: Optional[str] = next(non_latin_ranges, None)

    if primary_range is None:
        return ["Latin Based"]

    return unicode_range_languages(primary_range)
|
||||
|
||||
|
||||
@lru_cache()
def mb_encoding_languages(iana_name: str) -> List[str]:
    """
    Associate a multi-byte code page with the language it is heavily linked to.
    Returns an empty list when no specific language can be inferred.
    """
    japanese_prefixes = ("shift_", "iso2022_jp", "euc_j")

    if iana_name == "cp932" or iana_name.startswith(japanese_prefixes):
        return ["Japanese"]

    if iana_name.startswith("gb") or iana_name in ZH_NAMES:
        return ["Chinese"]

    if iana_name.startswith("iso2022_kr") or iana_name in KO_NAMES:
        return ["Korean"]

    return []
|
||||
|
||||
|
||||
@lru_cache(maxsize=LANGUAGE_SUPPORTED_COUNT)
def get_target_features(language: str) -> Tuple[bool, bool]:
    """
    Summarize a supported language as (uses accentuated characters, is purely Latin).
    """
    alphabet = FREQUENCIES[language]

    target_have_accents: bool = any(is_accentuated(character) for character in alphabet)
    target_pure_latin: bool = not any(
        is_latin(character) is False for character in alphabet
    )

    return target_have_accents, target_pure_latin
|
||||
|
||||
|
||||
def alphabet_languages(
    characters: List[str], ignore_non_latin: bool = False
) -> List[str]:
    """
    Return the languages compatible with the given characters, best match first.

    A language is kept when at least 20% of its frequent characters appear in
    `characters`. Languages without accents are discarded as soon as the source
    contains an accentuated character, and non pure-Latin languages are
    discarded when `ignore_non_latin` is set.
    """
    languages: List[Tuple[str, float]] = []

    # Built once: membership is tested for every character of every language.
    source_characters = set(characters)
    source_have_accents = any(is_accentuated(character) for character in characters)

    for language, language_characters in FREQUENCIES.items():
        target_have_accents, target_pure_latin = get_target_features(language)

        if ignore_non_latin and target_pure_latin is False:
            continue

        if target_have_accents is False and source_have_accents:
            continue

        character_count: int = len(language_characters)

        character_match_count: int = sum(
            1 for c in language_characters if c in source_characters
        )

        ratio: float = character_match_count / character_count

        if ratio >= 0.2:
            languages.append((language, ratio))

    # Stable sort: languages with equal ratios keep their FREQUENCIES order.
    languages = sorted(languages, key=lambda x: x[1], reverse=True)

    return [compatible_language[0] for compatible_language in languages]
|
||||
|
||||
|
||||
def characters_popularity_compare(
    language: str, ordered_characters: List[str]
) -> float:
    """
    Determine if an ordered characters list (by occurrence, from most frequent to rarest) matches a particular language.
    The result is a ratio between 0. (absolutely no correspondence) and 1. (near perfect fit).
    Beware that this function is not strict on the match in order to ease the detection. (Meaning a close match is 1.)

    :param language: Key of FREQUENCIES to compare against.
    :param ordered_characters: Observed characters, most frequent first.
    :return: Share of ``ordered_characters`` judged to sit at a plausible rank;
        0.0 when ``ordered_characters`` is empty.
    :raises ValueError: If ``language`` is not a key of FREQUENCIES.
    """
    if language not in FREQUENCIES:
        raise ValueError("{} not available".format(language))

    ordered_characters_count: int = len(ordered_characters)

    # Nothing to compare: report "no correspondence" rather than dividing by zero.
    if ordered_characters_count == 0:
        return 0.0

    language_characters: List[str] = FREQUENCIES[language]
    target_language_characters_count: int = len(language_characters)

    # First-occurrence rank of each language character (same answer as
    # list.index), computed once instead of scanning the list per character.
    rank_in_language: Dict[str, int] = {}
    for rank, language_character in enumerate(language_characters):
        rank_in_language.setdefault(language_character, rank)

    large_alphabet: bool = target_language_characters_count > 26

    # Loop invariant: scales a rank in the observed list onto the language list.
    expected_projection_ratio: float = (
        target_language_characters_count / ordered_characters_count
    )

    character_approved_count: int = 0

    for character_rank, character in enumerate(ordered_characters):
        character_rank_in_language = rank_in_language.get(character)

        if character_rank_in_language is None:
            continue

        character_rank_projection: int = int(character_rank * expected_projection_ratio)
        rank_distance: int = abs(character_rank_projection - character_rank_in_language)

        # Small alphabets: a character more than 4 ranks away is rejected outright.
        if large_alphabet is False and rank_distance > 4:
            continue

        # Large alphabets: within a third of the alphabet size is accepted outright.
        if (
            large_alphabet is True
            and rank_distance < target_language_characters_count / 3
        ):
            character_approved_count += 1
            continue

        characters_before_source: List[str] = language_characters[
            0:character_rank_in_language
        ]
        characters_after_source: List[str] = language_characters[
            character_rank_in_language:
        ]
        characters_before: List[str] = ordered_characters[0:character_rank]
        characters_after: List[str] = ordered_characters[character_rank:]

        before_match_count: int = len(
            set(characters_before) & set(characters_before_source)
        )

        after_match_count: int = len(
            set(characters_after) & set(characters_after_source)
        )

        # The two emptiness checks also protect the divisions below.
        if len(characters_before_source) == 0 and before_match_count <= 4:
            character_approved_count += 1
            continue

        if len(characters_after_source) == 0 and after_match_count <= 4:
            character_approved_count += 1
            continue

        if (
            before_match_count / len(characters_before_source) >= 0.4
            or after_match_count / len(characters_after_source) >= 0.4
        ):
            character_approved_count += 1

    return character_approved_count / ordered_characters_count
|
||||
|
||||
|
||||
def alpha_unicode_split(decoded_sequence: str) -> List[str]:
    """
    Given a decoded text sequence, return a list of str. Unicode range / alphabet separation.
    Ex. a text containing English/Latin with a bit of Hebrew will return two items in the resulting list;
    one containing the Latin letters and the other the Hebrew ones.

    Non-alphabetic characters, and characters for which unicode_range() returns
    None, are dropped. Every kept character is lower-cased.

    :param decoded_sequence: Text to split.
    :return: One string per layer, in the order the layers were first encountered.
    """
    # Characters are collected per layer and joined once at the end, which
    # avoids repeated string concatenation inside the loop.
    layers: Dict[str, List[str]] = {}

    for character in decoded_sequence:
        if character.isalpha() is False:
            continue

        character_range: Optional[str] = unicode_range(character)

        if character_range is None:
            continue

        # Default to a layer named after the character's own range; reuse the
        # first already-discovered layer that is not flagged as suspicious
        # next to this range.
        layer_target_range: str = character_range

        for discovered_range in layers:
            if (
                is_suspiciously_successive_range(discovered_range, character_range)
                is False
            ):
                layer_target_range = discovered_range
                break

        layers.setdefault(layer_target_range, []).append(character.lower())

    return ["".join(layer_characters) for layer_characters in layers.values()]
|
||||
|
||||
|
||||
def merge_coherence_ratios(results: List[CoherenceMatches]) -> CoherenceMatches:
    """
    Merge several coherence_ratio outputs into a single one.

    Every language is reported once with the mean of all the ratios collected
    for it, rounded to 4 decimals. The result is ordered from the highest mean
    to the lowest and has the same shape as a coherence_ratio return value.
    """
    ratios_by_language: Dict[str, List[float]] = {}

    for matches in results:
        for language, ratio in matches:
            ratios_by_language.setdefault(language, []).append(ratio)

    averaged = []
    for language, ratios in ratios_by_language.items():
        averaged.append((language, round(sum(ratios) / len(ratios), 4)))

    averaged.sort(key=lambda item: item[1], reverse=True)

    return averaged
|
||||
|
||||
|
||||
def filter_alt_coherence_matches(results: CoherenceMatches) -> CoherenceMatches:
    """
    Collapse alternative entries such as "English—" into their base language.

    Names are compared with the em-dash removed. When at least one name ends up
    with several ratios, a new list is returned holding each name once with its
    best ratio. Otherwise the input is returned untouched.
    """
    ratios_by_name: Dict[str, List[float]] = {}

    for language, ratio in results:
        ratios_by_name.setdefault(language.replace("—", ""), []).append(ratio)

    has_alternative = any(len(ratios) > 1 for ratios in ratios_by_name.values())

    if not has_alternative:
        return results

    return [(name, max(ratios)) for name, ratios in ratios_by_name.items()]
|
||||
|
||||
|
||||
@lru_cache(maxsize=2048)
def coherence_ratio(
    decoded_sequence: str, threshold: float = 0.1, lg_inclusion: Optional[str] = None
) -> CoherenceMatches:
    """
    Detect ANY language that can be identified in the given sequence.

    The sequence is analysed layer by layer, a layer being the characters
    extracted for one alphabet/range by alpha_unicode_split.

    :param decoded_sequence: Text to analyse.
    :param threshold: Ratios below this value are discarded.
    :param lg_inclusion: Optional comma separated language names to test instead
        of the ones guessed from the alphabet. The special entry "Latin Based"
        is removed from that list and restricts guessing to pure Latin languages.
    :return: (language, ratio) pairs, best ratio first.
    """
    forced_languages: List[str] = (
        [] if lg_inclusion is None else lg_inclusion.split(",")
    )

    latin_only: bool = "Latin Based" in forced_languages
    if latin_only:
        forced_languages.remove("Latin Based")

    matches: List[Tuple[str, float]] = []
    strong_match_count: int = 0

    for layer in alpha_unicode_split(decoded_sequence):
        ranked = Counter(layer).most_common()

        occurrences_total: int = sum(occurrences for _, occurrences in ranked)

        # Layers that are too short are skipped.
        if occurrences_total <= TOO_SMALL_SEQUENCE:
            continue

        characters_by_popularity: List[str] = [character for character, _ in ranked]

        candidates = forced_languages or alphabet_languages(
            characters_by_popularity, latin_only
        )

        for language in candidates:
            ratio: float = characters_popularity_compare(
                language, characters_by_popularity
            )

            if ratio < threshold:
                continue

            if ratio >= 0.8:
                strong_match_count += 1

            matches.append((language, round(ratio, 4)))

            # Three strong matches overall: stop testing languages for this layer.
            if strong_match_count >= 3:
                break

    return sorted(
        filter_alt_coherence_matches(matches), key=lambda x: x[1], reverse=True
    )
|
||||
+6
@@ -0,0 +1,6 @@
|
||||
from .__main__ import cli_detect, query_yes_no
|
||||
|
||||
__all__ = (
|
||||
"cli_detect",
|
||||
"query_yes_no",
|
||||
)
|
||||
+296
@@ -0,0 +1,296 @@
|
||||
import argparse
|
||||
import sys
|
||||
from json import dumps
|
||||
from os.path import abspath, basename, dirname, join, realpath
|
||||
from platform import python_version
|
||||
from typing import List, Optional
|
||||
from unicodedata import unidata_version
|
||||
|
||||
import charset_normalizer.md as md_module
|
||||
from charset_normalizer import from_fp
|
||||
from charset_normalizer.models import CliDetectionResult
|
||||
from charset_normalizer.version import __version__
|
||||
|
||||
|
||||
def query_yes_no(question: str, default: Optional[str] = "yes") -> bool:
    """Ask a yes/no question via input() and return the answer.

    The question is asked again until an accepted answer is given.

    :param question: Text presented to the user; a ``[y/n]`` style hint is appended.
    :param default: Presumed answer if the user just hits <Enter>. It must be
        "yes" (the default), "no" or None (meaning an answer is required).
    :return: True for "yes", False for "no".
    :raises ValueError: If ``default`` is none of "yes", "no" or None.

    Credit goes to (c) https://stackoverflow.com/questions/3041986/apt-command-line-interface-like-yes-no-input
    """
    valid = {"yes": True, "y": True, "ye": True, "no": False, "n": False}

    # The capital letter in the hint shows which answer <Enter> selects.
    if default is None:
        prompt = " [y/n] "
    elif default == "yes":
        prompt = " [Y/n] "
    elif default == "no":
        prompt = " [y/N] "
    else:
        raise ValueError("invalid default answer: '%s'" % default)

    while True:
        sys.stdout.write(question + prompt)
        choice = input().lower()

        if default is not None and choice == "":
            return valid[default]

        if choice in valid:
            return valid[choice]

        sys.stdout.write("Please respond with 'yes' or 'no' " "(or 'y' or 'n').\n")
|
||||
|
||||
|
||||
def cli_detect(argv: Optional[List[str]] = None) -> int:
    """
    CLI assistant using ARGV and ArgumentParser.

    Detects the encoding of every given file, optionally writes a UTF-8
    normalized copy (or replaces the file), then prints either a JSON report or,
    with --minimal, the detected encoding name(s).

    :param argv: Arguments to parse; None lets argparse read sys.argv.
    :return: 0 if everything is fine, 1 on an invalid option combination or
        threshold, 2 when the normalized file could not be written.
    """
    parser = argparse.ArgumentParser(
        description="The Real First Universal Charset Detector. "
        "Discover originating encoding used on text file. "
        "Normalize text to unicode."
    )

    parser.add_argument(
        "files", type=argparse.FileType("rb"), nargs="+", help="File(s) to be analysed"
    )
    parser.add_argument(
        "-v",
        "--verbose",
        action="store_true",
        default=False,
        dest="verbose",
        help="Display complementary information about file if any. "
        "Stdout will contain logs about the detection process.",
    )
    parser.add_argument(
        "-a",
        "--with-alternative",
        action="store_true",
        default=False,
        dest="alternatives",
        help="Output complementary possibilities if any. Top-level JSON WILL be a list.",
    )
    parser.add_argument(
        "-n",
        "--normalize",
        action="store_true",
        default=False,
        dest="normalize",
        help="Permit to normalize input file. If not set, program does not write anything.",
    )
    parser.add_argument(
        "-m",
        "--minimal",
        action="store_true",
        default=False,
        dest="minimal",
        help="Only output the charset detected to STDOUT. Disabling JSON output.",
    )
    parser.add_argument(
        "-r",
        "--replace",
        action="store_true",
        default=False,
        dest="replace",
        help="Replace file when trying to normalize it instead of creating a new one.",
    )
    parser.add_argument(
        "-f",
        "--force",
        action="store_true",
        default=False,
        dest="force",
        help="Replace file without asking if you are sure, use this flag with caution.",
    )
    parser.add_argument(
        "-t",
        "--threshold",
        action="store",
        default=0.2,
        type=float,
        dest="threshold",
        help="Define a custom maximum amount of chaos allowed in decoded content. 0. <= chaos <= 1.",
    )
    parser.add_argument(
        "--version",
        action="version",
        version="Charset-Normalizer {} - Python {} - Unicode {} - SpeedUp {}".format(
            __version__,
            python_version(),
            unidata_version,
            "OFF" if md_module.__file__.lower().endswith(".py") else "ON",
        ),
        help="Show version information and exit.",
    )

    args = parser.parse_args(argv)

    if args.replace is True and args.normalize is False:
        print("Use --replace in addition of --normalize only.", file=sys.stderr)
        return 1

    if args.force is True and args.replace is False:
        print("Use --force in addition of --replace only.", file=sys.stderr)
        return 1

    if args.threshold < 0.0 or args.threshold > 1.0:
        print("--threshold VALUE should be between 0. AND 1.", file=sys.stderr)
        return 1

    # One entry per reported match, across all files, in processing order.
    x_ = []

    for my_file in args.files:
        matches = from_fp(my_file, threshold=args.threshold, explain=args.verbose)

        best_guess = matches.best()

        if best_guess is None:
            print(
                'Unable to identify originating encoding for "{}". {}'.format(
                    my_file.name,
                    "Maybe try increasing maximum amount of chaos."
                    if args.threshold < 1.0
                    else "",
                ),
                file=sys.stderr,
            )
            x_.append(
                CliDetectionResult(
                    abspath(my_file.name),
                    None,
                    [],
                    [],
                    "Unknown",
                    [],
                    False,
                    1.0,
                    0.0,
                    None,
                    True,
                )
            )
        else:
            # Kept by name so the normalized path below is recorded on this
            # file's own result, not on the first result of the whole run.
            primary_result = CliDetectionResult(
                abspath(my_file.name),
                best_guess.encoding,
                best_guess.encoding_aliases,
                [
                    cp
                    for cp in best_guess.could_be_from_charset
                    if cp != best_guess.encoding
                ],
                best_guess.language,
                best_guess.alphabets,
                best_guess.bom,
                best_guess.percent_chaos,
                best_guess.percent_coherence,
                None,
                True,
            )
            x_.append(primary_result)

            if len(matches) > 1 and args.alternatives:
                for el in matches:
                    if el != best_guess:
                        x_.append(
                            CliDetectionResult(
                                abspath(my_file.name),
                                el.encoding,
                                el.encoding_aliases,
                                [
                                    cp
                                    for cp in el.could_be_from_charset
                                    if cp != el.encoding
                                ],
                                el.language,
                                el.alphabets,
                                el.bom,
                                el.percent_chaos,
                                el.percent_coherence,
                                None,
                                False,
                            )
                        )

            if args.normalize is True:
                if best_guess.encoding.startswith("utf") is True:
                    print(
                        '"{}" file does not need to be normalized, as it already came from unicode.'.format(
                            my_file.name
                        ),
                        file=sys.stderr,
                    )
                    if my_file.closed is False:
                        my_file.close()
                    continue

                dir_path = dirname(realpath(my_file.name))
                file_name = basename(realpath(my_file.name))

                o_: List[str] = file_name.split(".")

                if args.replace is False:
                    # New file: insert the detected encoding before the last
                    # dot-separated part of the name.
                    o_.insert(-1, best_guess.encoding)
                    if my_file.closed is False:
                        my_file.close()
                elif (
                    args.force is False
                    and query_yes_no(
                        'Are you sure to normalize "{}" by replacing it ?'.format(
                            my_file.name
                        ),
                        "no",
                    )
                    is False
                ):
                    if my_file.closed is False:
                        my_file.close()
                    continue

                try:
                    primary_result.unicode_path = join(dir_path, ".".join(o_))

                    with open(primary_result.unicode_path, "w", encoding="utf-8") as fp:
                        fp.write(str(best_guess))
                except IOError as e:
                    print(str(e), file=sys.stderr)
                    if my_file.closed is False:
                        my_file.close()
                    return 2

        if my_file.closed is False:
            my_file.close()

    if args.minimal is False:
        # A single result is printed as an object, several as a list.
        print(
            dumps(
                [el.__dict__ for el in x_] if len(x_) > 1 else x_[0].__dict__,
                ensure_ascii=True,
                indent=4,
            )
        )
    else:
        for my_file in args.files:
            print(
                ", ".join(
                    [
                        el.encoding or "undefined"
                        for el in x_
                        if el.path == abspath(my_file.name)
                    ]
                )
            )

    return 0
|
||||
|
||||
|
||||
if __name__ == "__main__":
    # Hand cli_detect()'s status code to the shell instead of always exiting 0.
    sys.exit(cli_detect())
|
||||
+1995
File diff suppressed because it is too large
Load Diff
+54
@@ -0,0 +1,54 @@
|
||||
from typing import Any, Dict, Optional, Union
|
||||
from warnings import warn
|
||||
|
||||
from .api import from_bytes
|
||||
from .constant import CHARDET_CORRESPONDENCE
|
||||
|
||||
|
||||
def detect(
    byte_str: bytes, should_rename_legacy: bool = False, **kwargs: Any
) -> Dict[str, Optional[Union[str, float]]]:
    """
    chardet legacy method
    Detect the encoding of the given byte string. It should be mostly backward-compatible.
    Encoding names follow chardet's own spelling whenever possible (not for encodings it does not support).
    This function is deprecated and exists to ease migration; consult the documentation for
    further information. Not planned for removal.

    :param byte_str: The byte sequence to examine.
    :param should_rename_legacy: Should we rename legacy encodings
                                 to their more modern equivalents?
    :return: A dict with the keys "encoding", "language" and "confidence".
    :raises TypeError: If ``byte_str`` is neither bytes nor bytearray.
    """
    if kwargs:
        warn(
            f"charset-normalizer disregard arguments '{','.join(list(kwargs.keys()))}' in legacy function detect()"
        )

    if not isinstance(byte_str, (bytearray, bytes)):
        raise TypeError(  # pragma: nocover
            "Expected object of type bytes or bytearray, got: "
            "{0}".format(type(byte_str))
        )

    payload = bytes(byte_str) if isinstance(byte_str, bytearray) else byte_str

    r = from_bytes(payload).best()

    if r is None:
        encoding, language, confidence = None, "", None
    else:
        encoding = r.encoding
        language = "" if r.language == "Unknown" else r.language
        confidence = 1.0 - r.chaos

        # Note: CharsetNormalizer does not return 'UTF-8-SIG' as the sig gets stripped in the detection/normalization
        # process, but chardet does return 'utf-8-sig' and it is a valid codec name.
        if encoding == "utf_8" and r.bom:
            encoding += "_sig"

    if should_rename_legacy is False and encoding in CHARDET_CORRESPONDENCE:
        encoding = CHARDET_CORRESPONDENCE[encoding]

    return {
        "encoding": encoding,
        "language": language,
        "confidence": confidence,
    }
|
||||
BIN
Binary file not shown.
Some files were not shown because too many files have changed in this diff Show More
Reference in New Issue
Block a user