commit 128a1d6854ded458ea16c9d57e5cf10f4ae0141c Author: Liam Pietralla Date: Sat Dec 28 21:03:41 2024 +1100 initial commit diff --git a/.dockerignore b/.dockerignore new file mode 100644 index 0000000..aa9be4b --- /dev/null +++ b/.dockerignore @@ -0,0 +1,5 @@ +bin/ +include/ +lib/ +lib64/ +pyvenv.cfg \ No newline at end of file diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..f452ff5 --- /dev/null +++ b/.gitignore @@ -0,0 +1,6 @@ +# VENV +bin/ +include/ +lib/ +lib64/ +pyvenv.cfg \ No newline at end of file diff --git a/Dockerfile b/Dockerfile new file mode 100644 index 0000000..d670635 --- /dev/null +++ b/Dockerfile @@ -0,0 +1,11 @@ +FROM python:3.12-slim AS base + +WORKDIR /app + +COPY requirements.txt . + +RUN pip install --no-cache-dir -r requirements.txt + +COPY main.py . + +CMD ["python", "main.py"] \ No newline at end of file diff --git a/README.md b/README.md new file mode 100644 index 0000000..007ca6e --- /dev/null +++ b/README.md @@ -0,0 +1,33 @@ +# S3 Recent File Cleaner + +Simple Python script to loop through all files in an S3 Endpoint and delete excess files based on a retention amount. + +Retention amount: 5 + +## Development + +Python 3.12 should be used for development and ideally a virtual environment should be used. With the source checked out run the following command to create a virtual environment. + +```bash +python3 -m venv ./ +``` + +The required packages can be installed by running the following command after activating the virtual environment. + +```bash +pip install -r requirements.txt +``` + +If new packages are installed, the requirements file can be updated by running the following command. + +```bash +pip freeze > requirements.txt +``` + +## Usage + +The S3 Recent File Cleaner is best run as a one-off Docker image. The image is already built so it can be run with the following command. 
+
+```bash
+docker run --rm -e S3_ACCESS_KEY=your_access_key -e S3_SECRET_KEY=your_secret_key -e S3_ENDPOINT=your_endpoint s3-recent-file-cleaner
+```
\ No newline at end of file
diff --git a/main.py b/main.py
new file mode 100644
index 0000000..ff77585
--- /dev/null
+++ b/main.py
@@ -0,0 +1,103 @@
+import sys
+import boto3
+import os
+
+DEFAULT = ''
+S3_ENDPOINT = os.getenv('S3_ENDPOINT', DEFAULT)
+S3_ACCESS_KEY = os.getenv('S3_ACCESS_KEY', DEFAULT)
+S3_SECRET_KEY = os.getenv('S3_SECRET_KEY', DEFAULT)
+DEFAULT_FILES_TO_KEEP = 5
+
+def _create_client():
+    """Build an S3 client from the endpoint and credentials read from the environment."""
+    return boto3.client('s3', endpoint_url=S3_ENDPOINT, aws_access_key_id=S3_ACCESS_KEY, aws_secret_access_key=S3_SECRET_KEY)
+
+def _list_files(s3, bucket_name):
+    """Return every object in the bucket as a list of list_objects_v2 entries."""
+    # A single list_objects_v2 call is capped at 1000 keys, so walk every page.
+    paginator = s3.get_paginator('list_objects_v2')
+    files = []
+    for page in paginator.paginate(Bucket=bucket_name):
+        # An empty bucket yields a page with no 'Contents' key at all.
+        files.extend(page.get('Contents', []))
+    return files
+
+def validate_env_vars():
+    """Exit with status 1 if a required environment variable is missing."""
+    required = (
+        ('S3_ENDPOINT', S3_ENDPOINT),
+        ('S3_ACCESS_KEY', S3_ACCESS_KEY),
+        ('S3_SECRET_KEY', S3_SECRET_KEY),
+    )
+    for name, value in required:
+        if value == '':
+            print(f'{name} is not set')
+            sys.exit(1)
+
+def clean_files():
+    """Delete the oldest files in every bucket, keeping the newest DEFAULT_FILES_TO_KEEP."""
+    print('Cleaning files...')
+
+    buckets = get_buckets()
+
+    if not buckets:
+        print('No buckets found')
+        return
+
+    # The connection details never change, so one client serves every bucket
+    s3 = _create_client()
+
+    # Loop through each bucket and check how many files are in it
+    for bucket in buckets:
+        try:
+            bucket_name = bucket['Name']
+            print(f'Bucket: {bucket_name}')
+
+            # Get all files in the bucket
+            files = _list_files(s3, bucket_name)
+
+            # Check if the number of files is greater than the number of files to keep
+            if len(files) <= DEFAULT_FILES_TO_KEEP:
+                print('No files to delete')
+                continue
+
+            # Sort the files by date, oldest first
+            files.sort(key=lambda x: x['LastModified'])
+
+            # Delete the oldest files
+            for file in files[:len(files) - DEFAULT_FILES_TO_KEEP]:
+                # Read the key outside the try so the error message below can always use it
+                file_name = file['Key']
+                try:
+                    print(f'Deleting file: {file_name}')
+                    s3.delete_object(Bucket=bucket_name, Key=file_name)
+                    print(f'File deleted: {file_name}')
+                except Exception as e:
+                    # Best effort: report the failure and carry on with the next file
+                    print(f'Error deleting file: {file_name}')
+                    print(e)
+        except Exception as e:
+            # Best effort: one broken bucket must not stop the others being cleaned
+            print('Error processing bucket')
+            print(e)
+
+    print('Files cleaned')
+
+def get_buckets():
+    """Return the list of buckets at the endpoint, exiting with status 1 on failure."""
+    try:
+        # Get all buckets for this endpoint
+        s3 = _create_client()
+        response = s3.list_buckets()
+        return response['Buckets']
+    except Exception as e:
+        print('Error getting buckets')
+        print(e)
+        sys.exit(1)
+
+if __name__ == '__main__':
+    # Validate the environment variables
+    validate_env_vars()
+
+    # Clean the files
+    clean_files()
\ No newline at end of file
diff --git a/requirements.txt b/requirements.txt
new file mode 100644
index 0000000..5c754a3
--- /dev/null
+++ b/requirements.txt
@@ -0,0 +1,7 @@
+boto3==1.35.90
+botocore==1.35.90
+jmespath==1.0.1
+python-dateutil==2.9.0.post0
+s3transfer==0.10.4
+six==1.17.0
+urllib3==2.3.0