diff --git a/util/s3Sync.ts b/util/s3Sync.ts index be9cb50..5482aae 100644 --- a/util/s3Sync.ts +++ b/util/s3Sync.ts @@ -8,6 +8,8 @@ import { FolderPaths } from "./serverInit.js"; import axios from "axios"; import { UUID } from "crypto"; import fsPromises from "fs/promises"; +import crypto from "crypto"; +import { createReadStream } from "fs"; const execAsync = promisify(exec); @@ -24,6 +26,8 @@ export interface JobFolderStats { jobid: UUID | string | null; //relativePath: string; document_count: number; + unique_document_count: number; + duplicate_count: number; total_size_bytes: number; total_size_mb: number; file_type_stats: { [extension: string]: number }; @@ -33,6 +37,8 @@ export interface JobsDirectoryAnalysis { bodyshopid: UUID; total_jobs: number; total_documents: number; + unique_documents: number; + duplicate_documents: number; total_size_bytes: number; total_size_mb: number; file_type_stats: { [extension: string]: number }; @@ -203,6 +209,8 @@ export async function analyzeJobsDirectory(): Promise<JobsDirectoryAnalysis> { bodyshopid, total_jobs: 0, total_documents: 0, + unique_documents: 0, + duplicate_documents: 0, total_size_bytes: 0, total_size_mb: 0, file_type_stats: {}, @@ -216,6 +224,8 @@ export async function analyzeJobsDirectory(): Promise<JobsDirectoryAnalysis> { const jobFolders = await readdir(jobsPath); let total_documents = 0; + let total_unique_documents = 0; + let total_duplicate_documents = 0; let total_size_bytes = 0; let total_jobs = 0; const aggregated_file_type_stats: { [extension: string]: number } = {}; @@ -237,6 +247,8 @@ export async function analyzeJobsDirectory(): Promise<JobsDirectoryAnalysis> { total_jobs++; total_documents += folderStats.document_count; + total_unique_documents += folderStats.unique_document_count; + total_duplicate_documents += folderStats.duplicate_count; total_size_bytes += folderStats.total_size_bytes; // Aggregate file type stats @@ -268,6 +280,8 @@ export async function analyzeJobsDirectory(): Promise<JobsDirectoryAnalysis> { bodyshopid, //read from the config.json file in the root directory 
total_jobs, total_documents, + unique_documents: total_unique_documents, + duplicate_documents: total_duplicate_documents, total_size_bytes, total_size_mb: Math.round((total_size_bytes / (1024 * 1024)) * 100) / 100, file_type_stats: aggregated_file_type_stats, @@ -275,9 +289,9 @@ export async function analyzeJobsDirectory(): Promise<JobsDirectoryAnalysis> { }; logger.info( - `Jobs directory analysis complete: ${analysis.total_jobs} jobs, ${analysis.total_documents} documents, ${analysis.total_size_mb} MB` + `Jobs directory analysis complete: ${analysis.total_jobs} jobs, ${analysis.total_documents} documents (${analysis.unique_documents} unique, ${analysis.duplicate_documents} duplicates), ${analysis.total_size_mb} MB` ); - + //Add an upload to the IO database to categorize all of this. const apiURL = process.env.IS_TEST ? "https://api.test.imex.online/analytics/documents" @@ -311,7 +325,8 @@ export async function analyzeJobsDirectory(): Promise<JobsDirectoryAnalysis> { async function analyzeJobFolder(jobsPath: string, jobid: string, bodyshopid: UUID): Promise<JobFolderStats> { const jobFolderPath = path.join(jobsPath, jobid); - const { document_count, total_size_bytes, file_type_stats } = await getDirectoryStats(jobFolderPath); + const { document_count, unique_document_count, duplicate_count, total_size_bytes, file_type_stats } = + await getDirectoryStats(jobFolderPath); const uuidRegex = /^[0-9a-fA-F]{8}-[0-9a-fA-F]{4}-[0-9a-fA-F]{4}-[0-9a-fA-F]{4}-[0-9a-fA-F]{12}$/; let validJobid: UUID | string | null = null; @@ -327,6 +342,8 @@ async function analyzeJobFolder(jobsPath: string, jobid: string, bodyshopid: UUI bodyshopid, jobid: validJobid, document_count, + unique_document_count, + duplicate_count, total_size_bytes, total_size_mb: Math.round((total_size_bytes / (1024 * 1024)) * 100) / 100, file_type_stats @@ -334,14 +351,40 @@ async function analyzeJobFolder(jobsPath: string, jobid: string, bodyshopid: UUI } /** - * Recursively get document count and total size for a directory (helper function) + * Calculate SHA256 hash of 
a file's content */ -async function getDirectoryStats( - dirPath: string -): Promise<{ document_count: number; total_size_bytes: number; file_type_stats: { [extension: string]: number } }> { +async function calculateFileHash(filePath: string): Promise<string> { + return new Promise((resolve, reject) => { + const hash = crypto.createHash("sha256"); + const stream = createReadStream(filePath); + + stream.on("data", (data: string | Buffer) => hash.update(data)); + stream.on("end", () => resolve(hash.digest("hex"))); + stream.on("error", (error: Error) => { + console.error("Hashing error:", error); + reject(error); + }); + }); +} + +/** + * Recursively get document count and total size for a directory (helper function) + * Now with duplicate detection using content-based hashing + */ +async function getDirectoryStats(dirPath: string): Promise<{ + document_count: number; + unique_document_count: number; + duplicate_count: number; + total_size_bytes: number; + file_type_stats: { [extension: string]: number }; + contentHashes: Set<string>; +}> { let document_count = 0; + let unique_document_count = 0; + let duplicate_count = 0; let total_size_bytes = 0; const file_type_stats: { [extension: string]: number } = {}; + const contentHashes = new Set<string>(); try { const items = await readdir(dirPath); @@ -360,12 +403,19 @@ async function getDirectoryStats( // Recursively analyze subdirectories const subStats = await getDirectoryStats(itemPath); document_count += subStats.document_count; + unique_document_count += subStats.unique_document_count; + duplicate_count += subStats.duplicate_count; total_size_bytes += subStats.total_size_bytes; // Merge file type stats for (const [ext, count] of Object.entries(subStats.file_type_stats)) { file_type_stats[ext] = (file_type_stats[ext] || 0) + count; } + + // Merge content hashes to detect duplicates across subdirectories + for (const hash of subStats.contentHashes) { + contentHashes.add(hash); + } } else { // Count files as documents document_count++; @@ 
-373,14 +423,53 @@ async function getDirectoryStats( // Track file extension const ext = path.extname(item).toLowerCase() || "no-extension"; - file_type_stats[ext] = (file_type_stats[ext] || 0) + 1; + + // Calculate content hash for image files to detect duplicates + const isImageFile = [ + ".jpg", + ".jpeg", + ".png", + ".gif", + ".bmp", + ".webp", + ".heic", + ".heif", + ".tiff", + ".tif" + ].includes(ext); + + if (isImageFile) { + try { + const fileHash = await calculateFileHash(itemPath); + + if (contentHashes.has(fileHash)) { + // This is a duplicate - don't count in file_type_stats + duplicate_count++; + } else { + // This is unique + contentHashes.add(fileHash); + unique_document_count++; + // Only count unique files in file_type_stats + file_type_stats[ext] = (file_type_stats[ext] || 0) + 1; + } + } catch (hashError) { + logger.warning(`Failed to hash file ${itemPath}:`, hashError); + // If hashing fails, count as unique to avoid losing data + unique_document_count++; + file_type_stats[ext] = (file_type_stats[ext] || 0) + 1; + } + } else { + // Non-image files are counted as unique (not checking for duplicates) + unique_document_count++; + file_type_stats[ext] = (file_type_stats[ext] || 0) + 1; + } } } } catch (error) { logger.error(`Error analyzing directory ${dirPath}:`, error); } - return { document_count, total_size_bytes, file_type_stats }; + return { document_count, unique_document_count, duplicate_count, total_size_bytes, file_type_stats, contentHashes }; } let bodyshopid: UUID;