From be3918cbafa1160e4c0dea45a0ffeed767c39dd9 Mon Sep 17 00:00:00 2001 From: Patrick Fic Date: Fri, 31 Oct 2025 15:43:22 -0700 Subject: [PATCH] WIP s3 sync and directory stats. --- .dockerignore | 15 +- Dockerfile | 19 +- S3_SYNC_README.md | 197 ++++++++++++++++++++ docker-compose.yml | 6 + docker-create.md | 78 ++++++++ package-lock.json | 29 +++ package.json | 2 + server.ts | 43 +++++ util/dailyS3Scheduler.ts | 154 ++++++++++++++++ util/s3Sync.ts | 388 +++++++++++++++++++++++++++++++++++++++ 10 files changed, 926 insertions(+), 5 deletions(-) create mode 100644 S3_SYNC_README.md create mode 100644 util/dailyS3Scheduler.ts create mode 100644 util/s3Sync.ts diff --git a/.dockerignore b/.dockerignore index 5171c54..90ee08c 100644 --- a/.dockerignore +++ b/.dockerignore @@ -1,2 +1,15 @@ node_modules -npm-debug.log \ No newline at end of file +npm-debug.log +.git +.gitignore +README.md +S3_SYNC_README.md +docker-create.md +.env* +!.env.production +*.log +dist +.DS_Store +.vscode +coverage +.nyc_output \ No newline at end of file diff --git a/Dockerfile b/Dockerfile index aae6cf9..9109764 100644 --- a/Dockerfile +++ b/Dockerfile @@ -2,8 +2,12 @@ # Build stage for libraries FROM node:22-alpine AS builder -# Install build dependencies -RUN apk add --no-cache bash wget +# Install build dependencies including AWS CLI v2 +RUN apk add --no-cache bash wget curl unzip gcompat +RUN curl "https://awscli.amazonaws.com/awscli-exe-linux-x86_64.zip" -o "awscliv2.zip" && \ + unzip awscliv2.zip && \ + ./aws/install && \ + rm -rf awscliv2.zip aws/ # Node.js application build stage WORKDIR /usr/src/app @@ -21,8 +25,15 @@ FROM node:22-alpine RUN echo "https://dl-cdn.alpinelinux.org/alpine/v$(grep -oE '[0-9]+\.[0-9]+' /etc/alpine-release)/community" >> /etc/apk/repositories RUN apk update -# Install runtime dependencies only -RUN apk add --no-cache bash redis ghostscript graphicsmagick imagemagick libjpeg-turbo libpng libwebp tiff libheif libde265 x265 ffmpeg +# Install runtime 
dependencies including AWS CLI v2 +RUN apk add --no-cache bash redis ghostscript graphicsmagick imagemagick libjpeg-turbo libpng libwebp tiff libheif libde265 x265 ffmpeg curl unzip gcompat tzdata + +# Install AWS CLI v2 +RUN curl "https://awscli.amazonaws.com/awscli-exe-linux-x86_64.zip" -o "awscliv2.zip" && \ + unzip awscliv2.zip && \ + ./aws/install && \ + rm -rf awscliv2.zip aws/ && \ + aws --version RUN npm install -g pm2 diff --git a/S3_SYNC_README.md b/S3_SYNC_README.md new file mode 100644 index 0000000..aa06473 --- /dev/null +++ b/S3_SYNC_README.md @@ -0,0 +1,197 @@ +# S3 Daily Sync Configuration + +This application now includes automatic daily synchronization of the Jobs directory to an S3 bucket using the AWS CLI. + +## Prerequisites + +### AWS CLI Installation +The sync functionality requires the AWS CLI to be installed on your system: + +**macOS:** +```bash +curl "https://awscli.amazonaws.com/AWSCLIV2.pkg" -o "AWSCLIV2.pkg" +sudo installer -pkg AWSCLIV2.pkg -target / +``` + +**Linux:** +```bash +curl "https://awscli.amazonaws.com/awscli-exe-linux-x86_64.zip" -o "awscliv2.zip" +unzip awscliv2.zip +sudo ./aws/install +``` + +**Docker (if running in container):** +```dockerfile +RUN curl "https://awscli.amazonaws.com/awscli-exe-linux-x86_64.zip" -o "awscliv2.zip" && \ + unzip awscliv2.zip && \ + ./aws/install && \ + rm -rf awscliv2.zip aws/ +``` + +## Required Environment Variables + +Add the following environment variables to your `.env` file: + +```env +# S3 Configuration (Required for daily sync) +S3_BUCKET_NAME=your-s3-bucket-name +S3_REGION=us-east-1 +S3_ACCESS_KEY_ID=your-access-key-id +S3_SECRET_ACCESS_KEY=your-secret-access-key + +# Optional: S3 key prefix (defaults to "jobs/" if not specified) +S3_KEY_PREFIX=jobs/ +``` + +## Features + +### Automatic Daily Sync +- Runs every day at midnight PST/PDT using `aws s3 sync` +- Uses the `--delete` flag to remove files from S3 that no longer exist locally +- Efficient incremental sync (only uploads 
changed files) +- Comprehensive logging of sync operations + +### Jobs Directory Analysis +- Real-time analysis of all job folders in the Jobs directory +- Recursive document counting and size calculation +- Detailed per-job statistics including document counts and sizes +- Useful for monitoring storage usage and job completion status +- No S3 configuration required for analysis functionality + +### API Endpoints + +#### Check Sync Status +``` +GET /sync/status +``` +Returns the current status of the S3 sync scheduler including: +- Configuration status +- Scheduler running status +- Next scheduled run time +- S3 connection availability + +#### Manual Sync Trigger +``` +POST /sync/trigger +``` +Manually triggers an S3 sync operation (useful for testing). + +#### Jobs Directory Analysis +``` +GET /jobs/analysis +``` +Analyzes the Jobs directory and returns detailed statistics: +- Total number of job folders +- Total documents across all jobs +- Total size in bytes and MB +- Per-job statistics including: + - Job ID (folder name) + - Relative path + - Document count in that job + - Total size for that job + +**Example Response:** +```json +{ + "totalJobs": 150, + "totalDocuments": 1250, + "totalSizeBytes": 2147483648, + "totalSizeMB": 2048.0, + "jobs": [ + { + "jobId": "JOB-001", + "relativePath": "Jobs/JOB-001", + "documentCount": 8, + "totalSizeBytes": 15728640, + "totalSizeMB": 15.0 + }, + ... + ] +} +``` + +## Setup Instructions + +1. **Install AWS CLI**: Follow the installation instructions above for your platform + +2. **Configure S3 Bucket**: + - Create an S3 bucket in your AWS account + - Create an IAM user with S3 permissions for the bucket + - Generate access keys for the IAM user + +3. **Set Environment Variables**: + - Add the S3 configuration to your environment file + - Restart the server + +4. 
**Test the Setup**: + - Check the sync status: `GET /sync/status` + - Trigger a manual sync: `POST /sync/trigger` + - Monitor the logs for sync operations + +## IAM Permissions + +Your IAM user needs the following permissions for the S3 bucket: + +```json +{ + "Version": "2012-10-17", + "Statement": [ + { + "Effect": "Allow", + "Action": [ + "s3:GetObject", + "s3:PutObject", + "s3:DeleteObject", + "s3:ListBucket" + ], + "Resource": [ + "arn:aws:s3:::your-bucket-name", + "arn:aws:s3:::your-bucket-name/*" + ] + } + ] +} +``` + +## Advantages of Using AWS CLI + +### Why AWS CLI vs SDK? +- **Simplicity**: Single command (`aws s3 sync`) handles everything +- **Efficiency**: AWS CLI is optimized for bulk operations +- **Robustness**: Built-in retry logic and error handling +- **Features**: Automatic multipart uploads, checksums, and progress tracking +- **Maintenance**: No need to manage complex SDK code for file operations + +### AWS CLI Sync Command +The sync uses: `aws s3 sync /local/jobs/path s3://bucket/jobs/ --delete` + +This command: +- Only uploads files that are new or modified (based on size and timestamp) +- Automatically handles large files with multipart uploads +- Deletes files from S3 that no longer exist locally (with `--delete` flag) +- Provides detailed output of what was transferred + +## Troubleshooting + +- **AWS CLI not found**: Install AWS CLI using the instructions above +- **Permission denied**: Check IAM permissions and access keys +- **Sync fails**: Check the application logs for detailed error messages +- **Connection issues**: Verify S3 bucket name and region +- **Test connection**: Use the status endpoint to verify S3 connectivity + +## How It Works + +1. **Scheduler**: Uses `node-cron` to schedule daily execution at midnight PST +2. **AWS CLI Check**: Verifies AWS CLI is installed and available +3. **Credential Setup**: Sets AWS credentials as environment variables +4. 
**Sync Execution**: Runs `aws s3 sync` with appropriate parameters +5. **Logging**: Captures and logs all command output and errors + +## Dependencies + +The implementation now only requires: +- `node-cron` for scheduling +- `fs-extra` for file system operations +- Node.js built-in `child_process` for executing AWS CLI commands + +No AWS SDK dependencies are needed! \ No newline at end of file diff --git a/docker-compose.yml b/docker-compose.yml index 5da2a5f..0d81fc2 100644 --- a/docker-compose.yml +++ b/docker-compose.yml @@ -12,4 +12,10 @@ services: - IMS_TOKEN= - CONVERT_QUALITY=0.5 - KEEP_CONVERTED_ORIGINALS=TRUE + # S3 Sync Configuration (uncomment and configure for daily sync) + # - S3_BUCKET_NAME=your-s3-bucket-name + # - S3_REGION=us-east-1 + # - S3_ACCESS_KEY_ID=your-access-key-id + # - S3_SECRET_ACCESS_KEY=your-secret-access-key + # - S3_KEY_PREFIX=jobs/ image: imexonline/media-server:beta \ No newline at end of file diff --git a/docker-create.md b/docker-create.md index 25deff2..17535c2 100644 --- a/docker-create.md +++ b/docker-create.md @@ -1,2 +1,80 @@ +# Docker Deployment Guide + +## Building the Image + +```bash +# Build latest version docker build . -t imexonline/media-server:latest -t imexonline/media-server:1.0 + +# Build beta version docker build . 
-t imexonline/media-server:beta +``` + +## Docker Image Features + +The Docker image includes: +- Node.js 22 Alpine base +- AWS CLI v2 for S3 sync functionality +- GraphicsMagick, ImageMagick, and FFmpeg for media processing +- Redis for background job processing +- PM2 for process management +- PST timezone configuration for scheduled tasks + +## Environment Variables + +### Required +- `MEDIA_PATH` - Path to media storage directory +- `IMS_TOKEN` - Authentication token + +### Optional +- `DUPLICATE_BILL_TO_VENDOR` - Whether to duplicate bills to vendor directory +- `CONVERT_QUALITY` - Image conversion quality (0.0-1.0) +- `KEEP_CONVERTED_ORIGINALS` - Whether to keep original files after conversion + +### S3 Sync (Optional) +- `S3_BUCKET_NAME` - S3 bucket name for daily sync +- `S3_REGION` - AWS region (default: us-east-1) +- `S3_ACCESS_KEY_ID` - AWS access key +- `S3_SECRET_ACCESS_KEY` - AWS secret key +- `S3_KEY_PREFIX` - S3 key prefix (default: jobs/) + +## Running with Docker Compose + +1. Update `docker-compose.yml` with your configuration +2. Uncomment and configure S3 environment variables if needed +3. 
Run: `docker-compose up -d` + +## Manual Docker Run + +```bash +docker run -d \ + --name bodyshop-media-server \ + -p 8000:8000 \ + -v "/path/to/media:/media" \ + -e MEDIA_PATH=/media \ + -e IMS_TOKEN=your-token \ + -e S3_BUCKET_NAME=your-bucket \ + -e S3_ACCESS_KEY_ID=your-key \ + -e S3_SECRET_ACCESS_KEY=your-secret \ + imexonline/media-server:latest +``` + +## S3 Sync in Docker + +The S3 sync functionality works automatically in Docker: +- AWS CLI v2 is pre-installed +- Timezone is set to PST for midnight scheduling +- Daily sync runs at midnight PST automatically +- Check sync status via API: `GET /sync/status` + +## Health Checks + +- Main health check: `GET /health` +- Application status: `GET /` (requires token) +- S3 sync status: `GET /sync/status` (requires token) + +## Logs + +View container logs: `docker logs bodyshop-media-server` + +The application uses structured logging with daily rotation. diff --git a/package-lock.json b/package-lock.json index 838dd4f..25f4e4c 100644 --- a/package-lock.json +++ b/package-lock.json @@ -26,6 +26,7 @@ "morgan": "^1.10.1", "multer": "^2.0.2", "nocache": "^4.0.0", + "node-cron": "^4.2.1", "response-time": "^2.3.4", "simple-thumbnail": "^1.6.5", "winston": "^3.17.0", @@ -40,6 +41,7 @@ "@types/morgan": "^1.9.10", "@types/multer": "^2.0.0", "@types/node": "^24.1.0", + "@types/node-cron": "^3.0.11", "@types/response-time": "^2.3.9", "nodemon": "^3.1.10", "prettier": "^3.6.2", @@ -745,6 +747,13 @@ "undici-types": "~7.8.0" } }, + "node_modules/@types/node-cron": { + "version": "3.0.11", + "resolved": "https://registry.npmjs.org/@types/node-cron/-/node-cron-3.0.11.tgz", + "integrity": "sha512-0ikrnug3/IyneSHqCBeslAhlK2aBfYek1fGo4bP4QnZPmiqSGRK+Oy7ZMisLWkesffJvQ1cqAcBnJC+8+nxIAg==", + "dev": true, + "license": "MIT" + }, "node_modules/@types/qs": { "version": "6.9.18", "resolved": "https://registry.npmjs.org/@types/qs/-/qs-6.9.18.tgz", @@ -2563,6 +2572,15 @@ "integrity": 
"sha512-AGK2yQKIjRuqnc6VkX2Xj5d+QW8xZ87pa1UK6yA6ouUyuxfHuMP6umE5QK7UmTeOAymo+Zx1Fxiuw9rVx8taHQ==", "license": "MIT" }, + "node_modules/node-cron": { + "version": "4.2.1", + "resolved": "https://registry.npmjs.org/node-cron/-/node-cron-4.2.1.tgz", + "integrity": "sha512-lgimEHPE/QDgFlywTd8yTR61ptugX3Qer29efeyWw2rv259HtGBNn1vZVmp8lB9uo9wC0t/AT4iGqXxia+CJFg==", + "license": "ISC", + "engines": { + "node": ">=6.0.0" + } + }, "node_modules/node-gyp-build-optional-packages": { "version": "5.2.2", "resolved": "https://registry.npmjs.org/node-gyp-build-optional-packages/-/node-gyp-build-optional-packages-5.2.2.tgz", @@ -4091,6 +4109,12 @@ "undici-types": "~7.8.0" } }, + "@types/node-cron": { + "version": "3.0.11", + "resolved": "https://registry.npmjs.org/@types/node-cron/-/node-cron-3.0.11.tgz", + "integrity": "sha512-0ikrnug3/IyneSHqCBeslAhlK2aBfYek1fGo4bP4QnZPmiqSGRK+Oy7ZMisLWkesffJvQ1cqAcBnJC+8+nxIAg==", + "dev": true + }, "@types/qs": { "version": "6.9.18", "resolved": "https://registry.npmjs.org/@types/qs/-/qs-6.9.18.tgz", @@ -5353,6 +5377,11 @@ "resolved": "https://registry.npmjs.org/node-abort-controller/-/node-abort-controller-3.1.1.tgz", "integrity": "sha512-AGK2yQKIjRuqnc6VkX2Xj5d+QW8xZ87pa1UK6yA6ouUyuxfHuMP6umE5QK7UmTeOAymo+Zx1Fxiuw9rVx8taHQ==" }, + "node-cron": { + "version": "4.2.1", + "resolved": "https://registry.npmjs.org/node-cron/-/node-cron-4.2.1.tgz", + "integrity": "sha512-lgimEHPE/QDgFlywTd8yTR61ptugX3Qer29efeyWw2rv259HtGBNn1vZVmp8lB9uo9wC0t/AT4iGqXxia+CJFg==" + }, "node-gyp-build-optional-packages": { "version": "5.2.2", "resolved": "https://registry.npmjs.org/node-gyp-build-optional-packages/-/node-gyp-build-optional-packages-5.2.2.tgz", diff --git a/package.json b/package.json index b2a45f5..2ff37b5 100644 --- a/package.json +++ b/package.json @@ -30,6 +30,7 @@ "morgan": "^1.10.1", "multer": "^2.0.2", "nocache": "^4.0.0", + "node-cron": "^4.2.1", "response-time": "^2.3.4", "simple-thumbnail": "^1.6.5", "winston": "^3.17.0", @@ -44,6 +45,7 @@ 
"@types/morgan": "^1.9.10", "@types/multer": "^2.0.0", "@types/node": "^24.1.0", + "@types/node-cron": "^3.0.11", "@types/response-time": "^2.3.9", "nodemon": "^3.1.10", "prettier": "^3.6.2", diff --git a/server.ts b/server.ts index f049b44..ce68635 100644 --- a/server.ts +++ b/server.ts @@ -21,6 +21,8 @@ import { JobsMoveMedia } from "./jobs/jobsMoveMedia.js"; import { JobMediaUploadMulter, jobsUploadMedia } from "./jobs/jobsUploadMedia.js"; import InitServer, { FolderPaths } from "./util/serverInit.js"; import ValidateImsToken from "./util/validateToken.js"; +import { dailyS3Scheduler } from "./util/dailyS3Scheduler.js"; +import { analyzeJobsDirectory } from "./util/s3Sync.js"; dotenv.config({ path: resolve(process.cwd(), `.env.${process.env.NODE_ENV || "development"}`) @@ -132,11 +134,52 @@ app.get("/health", (req, res) => { res.status(200).send("OK"); }); +// S3 sync status endpoint +app.get("/sync/status", ValidateImsToken, async (req, res) => { + try { + const status = await dailyS3Scheduler.getStatus(); + res.json(status); + } catch (error) { + logger.error("Failed to get sync status:", error); + const errorMessage = error instanceof Error ? error.message : "Unknown error"; + res.status(500).json({ error: errorMessage }); + } +}); + +// Manual S3 sync trigger endpoint (for testing) +app.post("/sync/trigger", ValidateImsToken, async (req, res) => { + try { + await dailyS3Scheduler.triggerManualSync(); + res.json({ success: true, message: "Manual sync triggered successfully" }); + } catch (error) { + logger.error("Manua--l sync failed:", error); + const errorMessage = error instanceof Error ? 
error.message : "Unknown error"; + res.status(500).json({ success: false, message: "Manual sync failed", error: errorMessage }); + } +}); + +// Jobs directory analysis endpoint +app.get("/jobs/analysis", ValidateImsToken, async (req, res) => { + try { + const analysis = await analyzeJobsDirectory(); + res.json(analysis); + } catch (error) { + logger.error("Failed to analyze jobs directory:", error); + const errorMessage = error instanceof Error ? error.message : "Unknown error"; + res.status(500).json({ success: false, message: "Jobs analysis failed", error: errorMessage }); + } +}); + // Static files InitServer(); app.use(FolderPaths.StaticPath, express.static(FolderPaths.Root, { etag: false, maxAge: 30 * 1000 })); app.use("/assets", express.static("/assets", { etag: false, maxAge: 30 * 1000 })); +// Start the daily S3 sync scheduler +dailyS3Scheduler.start().catch((error) => { + logger.error("Failed to start sync scheduler:", error); +}); + app.listen(port, () => { logger.info(`ImEX Media Server is running at http://localhost:${port}`); }); diff --git a/util/dailyS3Scheduler.ts b/util/dailyS3Scheduler.ts new file mode 100644 index 0000000..9226ab0 --- /dev/null +++ b/util/dailyS3Scheduler.ts @@ -0,0 +1,154 @@ +import * as cron from "node-cron"; +import { logger } from "../server.js"; +import { S3Sync, createS3SyncFromEnv } from "./s3Sync.js"; + +export class DailyS3Scheduler { + private s3Sync: S3Sync | null = null; + private cronJob: cron.ScheduledTask | null = null; + + constructor() { + this.s3Sync = createS3SyncFromEnv(); + } + + /** + * Start the daily S3 sync scheduler + * Runs at midnight PST (00:00 PST = 08:00 UTC during standard time, 07:00 UTC during daylight time) + */ + async start(): Promise { + if (!this.s3Sync) { + logger.warn("S3 sync not configured. 
Skipping scheduler setup."); + return; + } + + // Test S3 connection before starting scheduler + const connectionTest = await this.s3Sync.testConnection(); + if (!connectionTest) { + logger.error("S3 connection test failed. S3 sync scheduler will not be started."); + return; + } + + // Cron expression for midnight PST + // Note: This uses PST timezone. During PDT (daylight time), it will still run at midnight local time + const cronExpression = "0 0 * * *"; // Every day at midnight + const timezone = "America/Los_Angeles"; // PST/PDT timezone + + this.cronJob = cron.schedule( + cronExpression, + async () => { + await this.performDailySync(); + }, + { + timezone: timezone, + } + ); + + logger.info(`Daily S3 sync scheduler started. Will run at midnight PST/PDT.`); + logger.info(`Next sync scheduled for: ${this.getNextRunTime()}`); + } + + /** + * Stop the scheduler + */ + stop(): void { + if (this.cronJob) { + this.cronJob.stop(); + this.cronJob = null; + logger.info("Daily S3 sync scheduler stopped."); + } + } + + /** + * Perform the daily sync operation + */ + private async performDailySync(): Promise { + if (!this.s3Sync) { + logger.error("S3 sync not available for daily sync"); + return; + } + + const startTime = new Date(); + logger.info(`Starting daily S3 sync at ${startTime.toISOString()}`); + + try { + await this.s3Sync.syncJobsToS3(); + const endTime = new Date(); + const duration = endTime.getTime() - startTime.getTime(); + logger.info(`Daily S3 sync completed successfully in ${duration}ms`); + } catch (error) { + logger.error("Daily S3 sync failed:", error); + } + } + + /** + * Manually trigger a sync (useful for testing) + */ + async triggerManualSync(): Promise { + if (!this.s3Sync) { + logger.error("S3 sync not configured"); + return; + } + + logger.info("Triggering manual S3 sync..."); + await this.performDailySync(); + } + + /** + * Get the next scheduled run time + */ + private getNextRunTime(): string { + if (!this.cronJob) { + return "Not 
scheduled"; + } + + // Create a date object for midnight PST today + const now = new Date(); + const pstNow = new Date(now.toLocaleString("en-US", { timeZone: "America/Los_Angeles" })); + + // If it's past midnight today, next run is tomorrow at midnight + const nextRun = new Date(pstNow); + if (pstNow.getHours() > 0 || pstNow.getMinutes() > 0 || pstNow.getSeconds() > 0) { + nextRun.setDate(nextRun.getDate() + 1); + } + nextRun.setHours(0, 0, 0, 0); + + return nextRun.toLocaleString("en-US", { + timeZone: "America/Los_Angeles", + weekday: "long", + year: "numeric", + month: "long", + day: "numeric", + hour: "2-digit", + minute: "2-digit", + timeZoneName: "short" + }); + } + + /** + * Get scheduler status + */ + async getStatus(): Promise<{ + isConfigured: boolean; + isRunning: boolean; + nextRun: string; + syncStats?: { bucketName: string; region: string; keyPrefix: string; available: boolean }; + }> { + let syncStats; + if (this.s3Sync) { + try { + syncStats = await this.s3Sync.getSyncStats(); + } catch (error) { + logger.error("Failed to get sync stats:", error); + } + } + + return { + isConfigured: this.s3Sync !== null, + isRunning: this.cronJob !== null, + nextRun: this.getNextRunTime(), + syncStats, + }; + } +} + +// Export a singleton instance +export const dailyS3Scheduler = new DailyS3Scheduler(); \ No newline at end of file diff --git a/util/s3Sync.ts b/util/s3Sync.ts new file mode 100644 index 0000000..24e56eb --- /dev/null +++ b/util/s3Sync.ts @@ -0,0 +1,388 @@ +import { exec } from "child_process"; +import { promisify } from "util"; +import * as fs from "fs-extra"; +import * as path from "path"; +import { logger } from "../server.js"; +import { FolderPaths } from "./serverInit.js"; + +const execAsync = promisify(exec); + +interface S3SyncConfig { + bucketName: string; + region: string; + accessKeyId: string; + secretAccessKey: string; + keyPrefix?: string; +} + +export interface JobFolderStats { + jobId: string; + relativePath: string; + documentCount: 
number; + totalSizeBytes: number; + totalSizeMB: number; +} + +export interface JobsDirectoryAnalysis { + totalJobs: number; + totalDocuments: number; + totalSizeBytes: number; + totalSizeMB: number; + jobs: JobFolderStats[]; +} + +export class S3Sync { + private config: S3SyncConfig; + + constructor(config: S3SyncConfig) { + this.config = config; + } + + /** + * Sync the Jobs directory to S3 bucket using AWS CLI + */ + async syncJobsToS3(): Promise { + try { + logger.info("Starting S3 sync for Jobs directory using AWS CLI..."); + + const jobsPath = FolderPaths.Jobs; + const keyPrefix = this.config.keyPrefix || "jobs/"; + const s3Path = `s3://${this.config.bucketName}/${keyPrefix}`; + + // Check if Jobs directory exists + if (!(await fs.pathExists(jobsPath))) { + logger.warn(`Jobs directory does not exist: ${jobsPath}`); + return; + } + + // Check if AWS CLI is available + await this.checkAwsCli(); + + // Set AWS credentials as environment variables for the command + const env = { + ...process.env, + AWS_ACCESS_KEY_ID: this.config.accessKeyId, + AWS_SECRET_ACCESS_KEY: this.config.secretAccessKey, + AWS_DEFAULT_REGION: this.config.region + }; + + // Run AWS S3 sync command + const syncCommand = `aws s3 sync "${jobsPath}" "${s3Path}"`; + + logger.info(`Executing AWS S3 sync command`); + + const { stdout, stderr } = await execAsync(syncCommand, { + env, + maxBuffer: 1024 * 1024 * 10 // 10MB buffer for large sync outputs + }); + + if (stdout) { + logger.info("S3 sync output:", stdout); + } + + if (stderr) { + logger.warn("S3 sync warnings:", stderr); + } + + logger.info("S3 sync completed successfully using AWS CLI"); + } catch (error) { + logger.error("S3 sync failed:", error); + throw error; + } + } + + /** + * Check if AWS CLI is available + */ + private async checkAwsCli(): Promise { + try { + await execAsync("aws --version"); + } catch (error) { + throw new Error( + "AWS CLI not found. 
Please install AWS CLI: https://docs.aws.amazon.com/cli/latest/userguide/getting-started-install.html" + ); + } + } + + /** + * Test S3 connection using AWS CLI + */ + async testConnection(): Promise { + try { + // Check if AWS CLI is available first + await this.checkAwsCli(); + + // Set AWS credentials as environment variables + const env = { + ...process.env, + AWS_ACCESS_KEY_ID: this.config.accessKeyId, + AWS_SECRET_ACCESS_KEY: this.config.secretAccessKey, + AWS_DEFAULT_REGION: this.config.region + }; + + // Test connection by listing bucket + const testCommand = `aws s3 ls s3://${this.config.bucketName} --max-items 1`; + await execAsync(testCommand, { env }); + + logger.info(`S3 connection test successful for bucket: ${this.config.bucketName}`); + return true; + } catch (error) { + logger.error("S3 connection test failed:", error); + return false; + } + } + + /** + * Get sync statistics using AWS CLI + */ + async getSyncStats(): Promise<{ bucketName: string; region: string; keyPrefix: string; available: boolean }> { + const available = await this.testConnection(); + return { + bucketName: this.config.bucketName, + region: this.config.region, + keyPrefix: this.config.keyPrefix || "jobs/", + available + }; + } + + /** + * Analyze all job folders in the Jobs directory + * Returns detailed statistics for each job folder + */ + async analyzeJobsDirectory(): Promise { + try { + logger.info("Starting Jobs directory analysis..."); + + const jobsPath = FolderPaths.Jobs; + + // Check if Jobs directory exists + if (!(await fs.pathExists(jobsPath))) { + logger.warn(`Jobs directory does not exist: ${jobsPath}`); + return { + totalJobs: 0, + totalDocuments: 0, + totalSizeBytes: 0, + totalSizeMB: 0, + jobs: [] + }; + } + + const jobFolders = await fs.readdir(jobsPath); + const jobStats: JobFolderStats[] = []; + let totalDocuments = 0; + let totalSizeBytes = 0; + + for (const jobFolder of jobFolders) { + const jobFolderPath = path.join(jobsPath, jobFolder); + const stat = 
await fs.stat(jobFolderPath); + + // Only process directories + if (stat.isDirectory()) { + const folderStats = await this.analyzeJobFolder(jobsPath, jobFolder); + jobStats.push(folderStats); + totalDocuments += folderStats.documentCount; + totalSizeBytes += folderStats.totalSizeBytes; + } + } + + const analysis: JobsDirectoryAnalysis = { + totalJobs: jobStats.length, + totalDocuments, + totalSizeBytes, + totalSizeMB: Math.round((totalSizeBytes / (1024 * 1024)) * 100) / 100, + jobs: jobStats.sort((a, b) => a.jobId.localeCompare(b.jobId)) + }; + + logger.info( + `Jobs directory analysis complete: ${analysis.totalJobs} jobs, ${analysis.totalDocuments} documents, ${analysis.totalSizeMB} MB` + ); + return analysis; + } catch (error) { + logger.error("Failed to analyze Jobs directory:", error); + throw error; + } + } + + /** + * Analyze a single job folder + */ + private async analyzeJobFolder(jobsPath: string, jobId: string): Promise { + const jobFolderPath = path.join(jobsPath, jobId); + const relativePath = path.relative(FolderPaths.Root, jobFolderPath); + + const { documentCount, totalSizeBytes } = await this.getDirectoryStats(jobFolderPath); + + return { + jobId, + relativePath, + documentCount, + totalSizeBytes, + totalSizeMB: Math.round((totalSizeBytes / (1024 * 1024)) * 100) / 100 + }; + } + + /** + * Recursively get document count and total size for a directory + */ + private async getDirectoryStats(dirPath: string): Promise<{ documentCount: number; totalSizeBytes: number }> { + let documentCount = 0; + let totalSizeBytes = 0; + + try { + const items = await fs.readdir(dirPath); + + for (const item of items) { + const itemPath = path.join(dirPath, item); + const stat = await fs.stat(itemPath); + + if (stat.isDirectory()) { + // Recursively analyze subdirectories + const subStats = await this.getDirectoryStats(itemPath); + documentCount += subStats.documentCount; + totalSizeBytes += subStats.totalSizeBytes; + } else { + // Count files as documents + 
documentCount++; + totalSizeBytes += stat.size; + } + } + } catch (error) { + logger.error(`Error analyzing directory ${dirPath}:`, error); + } + + return { documentCount, totalSizeBytes }; + } +} + +/** + * Create S3Sync instance from environment variables + */ +export function createS3SyncFromEnv(): S3Sync | null { + const bucketName = process.env.S3_BUCKET_NAME || "test"; + const region = process.env.S3_REGION || "ca-central-1"; + const accessKeyId = process.env.S3_ACCESS_KEY_ID || "key"; + const secretAccessKey = process.env.S3_SECRET_ACCESS_KEY || "secret"; + const keyPrefix = process.env.S3_KEY_PREFIX || "prefix"; + + if (!bucketName || !accessKeyId || !secretAccessKey) { + logger.warn( + "S3 configuration incomplete. Required env vars: S3_BUCKET_NAME, S3_ACCESS_KEY_ID, S3_SECRET_ACCESS_KEY" + ); + return null; + } + + return new S3Sync({ + bucketName, + region, + accessKeyId, + secretAccessKey, + keyPrefix + }); +} + +/** + * Standalone function to analyze Jobs directory without S3 configuration + */ +export async function analyzeJobsDirectory(): Promise { + try { + logger.info("Starting Jobs directory analysis..."); + + const jobsPath = FolderPaths.Jobs; + + // Check if Jobs directory exists + if (!(await fs.pathExists(jobsPath))) { + logger.warn(`Jobs directory does not exist: ${jobsPath}`); + return { + totalJobs: 0, + totalDocuments: 0, + totalSizeBytes: 0, + totalSizeMB: 0, + jobs: [] + }; + } + + const jobFolders = await fs.readdir(jobsPath); + const jobStats: JobFolderStats[] = []; + let totalDocuments = 0; + let totalSizeBytes = 0; + + for (const jobFolder of jobFolders) { + const jobFolderPath = path.join(jobsPath, jobFolder); + const stat = await fs.stat(jobFolderPath); + + // Only process directories + if (stat.isDirectory()) { + const folderStats = await analyzeJobFolder(jobsPath, jobFolder); + jobStats.push(folderStats); + totalDocuments += folderStats.documentCount; + totalSizeBytes += folderStats.totalSizeBytes; + } + } + + const analysis: 
JobsDirectoryAnalysis = { + totalJobs: jobStats.length, + totalDocuments, + totalSizeBytes, + totalSizeMB: Math.round((totalSizeBytes / (1024 * 1024)) * 100) / 100, + jobs: jobStats.sort((a, b) => a.jobId.localeCompare(b.jobId)) + }; + + logger.info( + `Jobs directory analysis complete: ${analysis.totalJobs} jobs, ${analysis.totalDocuments} documents, ${analysis.totalSizeMB} MB` + ); + return analysis; + } catch (error) { + logger.error("Failed to analyze Jobs directory:", error); + throw error; + } +} + +/** + * Analyze a single job folder (standalone helper function) + */ +async function analyzeJobFolder(jobsPath: string, jobId: string): Promise { + const jobFolderPath = path.join(jobsPath, jobId); + const relativePath = path.relative(FolderPaths.Root, jobFolderPath); + + const { documentCount, totalSizeBytes } = await getDirectoryStats(jobFolderPath); + + return { + jobId, + relativePath, + documentCount, + totalSizeBytes, + totalSizeMB: Math.round((totalSizeBytes / (1024 * 1024)) * 100) / 100 + }; +} + +/** + * Recursively get document count and total size for a directory (standalone helper function) + */ +async function getDirectoryStats(dirPath: string): Promise<{ documentCount: number; totalSizeBytes: number }> { + let documentCount = 0; + let totalSizeBytes = 0; + + try { + const items = await fs.readdir(dirPath); + + for (const item of items) { + const itemPath = path.join(dirPath, item); + const stat = await fs.stat(itemPath); + + if (stat.isDirectory()) { + // Recursively analyze subdirectories + const subStats = await getDirectoryStats(itemPath); + documentCount += subStats.documentCount; + totalSizeBytes += subStats.totalSizeBytes; + } else { + // Count files as documents + documentCount++; + totalSizeBytes += stat.size; + } + } + } catch (error) { + logger.error(`Error analyzing directory ${dirPath}:`, error); + } + + return { documentCount, totalSizeBytes }; +}